def extract_and_serialize(txt_file, xml_file, out_file, atom_type='paragraph', cluster_method='kmeans', k=2): ''' Performs all of intrinsic (feature extraction, clustering etc.) and creates Passage objects for each passage in <txt_file>. Writes a CSV file out to <out_file> containing all the features of <txt_file> The CSV files can be read easily by R in order to create plots ''' f = file(txt_file, 'r') text = f.read() f.close() util = IntrinsicUtility() feature_names = [ 'average_word_length', 'average_sentence_length', 'stopword_percentage', 'punctuation_percentage', 'syntactic_complexity', 'avg_internal_word_freq_class', 'avg_external_word_freq_class' ] ext = FeatureExtractor(text) print 'Initialized extractor' # Note that passages don't know their ground truths yet passages = ext.get_passages(feature_names, atom_type) print 'Extracted passages' util.add_ground_truth_to_passages(passages, xml_file) feature_vecs = [p.features.values() for p in passages] # If just testing feature extraction, don't cluster passages if cluster_method != 'none': # Cluster the passages and set their confidences confidences = cluster(cluster_method, k, feature_vecs) for psg, conf in zip(passages, confidences): psg.set_plag_confidence(conf) f = file(out_file, 'wb') csv_writer = csv.writer(f) # Writes out the header for corresponding CSV csv_writer.writerow(IntrinsicPassage.serialization_header(feature_names)) for p in passages: csv_writer.writerow(p.to_list(feature_names)) f.close() print 'Finished writing', out_file
def _default_stepwise_params(): features = FeatureExtractor.get_all_feature_function_names() cluster_type = 'outlier' k = 2 atom_type = 'nchars' n = 500 first_doc_num = 0 results = stepwise_feature_selection(features, cluster_type, k, atom_type, n, first_doc_num=first_doc_num) print results return results
def get_feature_sets():
    '''
    Builds the list of feature sets we want to test: one singleton set per
    feature (so each feature is evaluated on its own) plus the complete
    feature list as a final combined set. For example:
    [['feat1'], ['feat2'], ..., ['feat1', 'feat2']]
    '''
    feature_names = FeatureExtractor.get_all_feature_function_names()
    sets_to_test = [[name] for name in feature_names]
    # Evaluate the full feature list as one combined set, too
    sets_to_test.append(feature_names)
    return sets_to_test
def compare_params(): ''' [('l1', 'auto', 0.59759576698869676, 'plagcomps/shared/../figures/roc1390881314.99.pdf'), ('l1', None, 0.60174204862821445, 'plagcomps/shared/../figures/roc1390881397.91.pdf'), ('l2', 'auto', 0.60095727893574291, 'plagcomps/shared/../figures/roc1390881480.62.pdf'), ('l2', None, 0.5977554082484301, 'plagcomps/shared/../figures/roc1390881563.36.pdf') ] ''' features = FeatureExtractor.get_all_feature_function_names() features = [f for f in features if 'unigram' not in f and 'trigram' not in f] cluster_type = 'outlier' atom_type = 'paragraph' start_doc = 0 ntrain = 100 ntest = 200 # Process the test set once test_matrix, actuals = _get_feature_conf_and_actuals(features, cluster_type, atom_type, ntrain, ntest) # Options for Log regression regularization_options = ['l1', 'l2'] class_weight_options = ['auto', None] results = [] for regularization in regularization_options: for class_weight in class_weight_options: model = train(features, cluster_type, atom_type, ntrain, start_doc=start_doc, regularization=regularization, class_weight=class_weight) confidences = [x[1] for x in model.predict_proba(test_matrix)] path, auc = BaseUtility.draw_roc(actuals, confidences, combination='Using Combination') results.append((regularization, class_weight, auc, path)) print results print results return results
# --- Module-level setup: runs at import time (DB connection is a side effect) ---

# Make a sibling PyGene checkout importable; assumes it sits next to this repo.
sys.path.append("../PyGene/")
from pygene.prog import ProgOrganism
from pygene.population import Population
import sqlalchemy
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base

# NOTE(review): a postgres URL is "postgresql://user:password@host[/dbname]",
# but the third %s slot (the host position) is filled with `dbname` here —
# verify `dbname` actually holds the host, or fix the format string.
# `username`, `password`, and `dbname` must be defined elsewhere in this module.
url = "postgresql://%s:%s@%s" % (username, password, dbname)
engine = sqlalchemy.create_engine(url)
Base = declarative_base()
# NOTE(review): create_all() is called on a freshly created Base before any
# mapped classes are visible at this point in the file, so it would create no
# tables here — confirm models are registered with this Base before this runs.
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
session = Session()

# Feature list and train/test document split shared by the functions below.
features = FeatureExtractor.get_all_feature_function_names()
num_training = 50
num_testing = 500
starting_doc = 0
training_files = IntrinsicUtility().get_n_training_files(
    n=num_training, first_doc_num=starting_doc)
# Test documents start right after the training block, keeping the sets disjoint.
test_files = IntrinsicUtility().get_n_training_files(
    n=num_testing, first_doc_num=starting_doc + num_training)

# Shared caches (populated elsewhere; presumably keyed per document — verify).
cached_reduced_docs = {}
cached_confidences = {}

# set base values for globals
atom_type, cluster_type = "paragraph", "kmeans"

# a tiny batch of functions
sys.path.append("../PyGene/") from pygene.prog import ProgOrganism from pygene.population import Population import sqlalchemy from sqlalchemy.orm import sessionmaker from sqlalchemy.ext.declarative import declarative_base url = "postgresql://%s:%s@%s" % (username, password, dbname) engine = sqlalchemy.create_engine(url) Base = declarative_base() Base.metadata.create_all(engine) Session = sessionmaker(bind=engine) session = Session() features = FeatureExtractor.get_all_feature_function_names() num_training = 50 num_testing = 500 starting_doc = 0 training_files = IntrinsicUtility().get_n_training_files(n=num_training, first_doc_num=starting_doc) test_files = IntrinsicUtility().get_n_training_files(n=num_testing, first_doc_num=starting_doc + num_training) cached_reduced_docs = {} cached_confidences = {} # set base values for globals atom_type, cluster_type = "paragraph", "kmeans" # a tiny batch of functions def add(x,y): #print "add: x=%s y=%s" % (repr(x), repr(y)) try:
def get_pairwise_results(atom_type, cluster_type, n, min_len, feature_set=None, cheating=False, write_output=False): ''' Generates a table for the results of all feature pairs. ''' all_features = FeatureExtractor.get_all_feature_function_names() if not feature_set: feature_set = list(itertools.combinations(all_features, 2)) feature_set += [(x,x) for x in all_features] session = Session() values = [] results = {} for feature_pair in feature_set: if feature_pair[0] == feature_pair[1]: feature_pair = [feature_pair[0]] trial = _get_latest_trial(atom_type, cluster_type, n, min_len, list(feature_pair), cheating, session) if trial: results[tuple(feature_pair)] = round(trial.auc, 4) values.append(trial.auc) else: results[tuple(feature_pair)] = "n/a" mean = numpy.array(values).mean() stdev = numpy.array(values).std() columns = all_features rows = all_features cells = [] for feature_a in rows: row = [] for feature_b in columns: if feature_a == feature_b: row.append(results[tuple([feature_a])]) else: if (feature_a, feature_b) in results: row.append(results[(feature_a, feature_b)]) elif (feature_b, feature_a) in results: row.append(results[(feature_b, feature_a)]) else: row.append('???') cells.append(row) # Is html table the best way to view it? 
html = '<html><head></head><body>' html += '<h1>Pairwise Feature Results</h1>' html += '<p>DASHBOARD_VERSION = ' + str(DASHBOARD_VERSION) + '</p>' html += '<p>cheating = ' + str(cheating) + '</p>' html += '<p>atom_type = ' + str(atom_type) + '</p>' html += '<p>cluster_type = ' + str(cluster_type) + '</p>' html += '<p>n >= ' + str(n) + '</p>' html += '<p>min_len = ' + str(min_len) + '</p>' html += '<p>auc mean = ' + str(round(mean, 4)) + ', stdev = ' + str(round(stdev, 4)) + '</p>' html += '<table border="1">' html += '<tr>' html += '<td></td>' for feature in columns: html += '<td style="font-size: 0.7em">' + feature + '</td>' html += '</tr>' for i, feature_a in enumerate(rows, 0): html += '<tr>' html += '<td>' + feature_a + '</td>' for j, feature_b in enumerate(columns, 0): # set bg color of table cell to help visualize good features if type(cells[i][j]) == float: val = cells[i][j] z_score = (val - mean) / stdev if z_score > 3: bgcolor = '#00FF00' elif z_score > 2: bgcolor = '#AAFFAA' elif z_score > 1: bgcolor = '#DDFFDD' elif z_score > -1: bgcolor = '#FFFFFF' elif z_score > -2: bgcolor = '#FFDDDD' elif z_score > -3: bgcolor = '#FFAAAA' else: bgcolor = '#FF0000' else: bgcolor = '#888888' html += '<td style="background-color: ' + bgcolor + '">' + str(cells[i][j]) + '</td>' html += '</tr>' html += '</table></body></html>' if write_output: html_path = os.path.join(os.path.dirname(__file__), "../figures/dashboard_pairwise_table_"+str(DASHBOARD_VERSION)+"_"+str(time.time())+".html") with open(html_path, 'w') as f: f.write(html) print 'Saved pairwise feature table to ' + html_path return html
def all_k_sets_of_features(k=2):
    '''
    Returns every size-<k> combination of the known feature names,
    each combination materialized as a list.
    '''
    feature_names = FeatureExtractor.get_all_feature_function_names()
    return [list(group) for group in itertools.combinations(feature_names, k)]