def calculate_metadata_features(specs, pdf_folder):
    """Count spec-listed words in each PDF's metadata fields.

    Reads the spec CSV ("features/metadata/specs_<specs>.csv") which maps a
    metadata field to the words that count as features for it, then extracts
    metadata from every PDF in pdf_folder and tallies occurrences of those
    words per field.

    Returns (fields, words, metadatas):
      fields    -- set of "safe" field names listed in the spec
      words     -- {field: set of feature words for that field}
      metadatas -- {pdf_id: {field: Counter of word occurrences}}
    """
    spec_filename = "features/metadata/specs_" + specs + ".csv"
    fields = set()
    words = {}
    with open(spec_filename, "r") as spec_file:
        for row in csv.DictReader(spec_file):
            field_name = row["field"].strip()
            fields.add(field_name)
            words.setdefault(field_name, set()).add(row["word"].strip())
    paths = glob.glob(pdf_folder + "*.pdf")
    results = calculate_metadatas(paths)
    metadatas = {}
    for path, metadata in zip(paths, results):
        pdf_id = utils.noext(path)
        metadatas[pdf_id] = {}
        for raw_field in metadata:
            safe_field = safify_field(raw_field)
            #only fields mentioned in the spec produce features
            if safe_field not in fields:
                continue
            counts = Counter()
            for token in metadata[raw_field].split(" "):
                if token in words[safe_field]:
                    counts[token] += 1
            metadatas[pdf_id][safe_field] = counts
    return (fields, words, metadatas)
def preprocess_text(pdf_folder, output_folder):
    """Queue every PDF in pdf_folder for plaintext extraction into output_folder."""
    jobs = []
    for pdf_path in glob.glob(pdf_folder + "*.pdf"):
        jobs.append({
            "plaintext_path": output_folder + utils.noext(pdf_path) + ".txt",
            "pdf_path": pdf_path,
        })
    do_preprocessing(jobs)
def read_preprocessed_textfiles(preprocessed_folder, valid_article_ids=None): #article_wcs is of the form {article_id: wcs} where wcs is a Counter with words #as keys and counts as values article_wcs = {} #vocab contains all words which occur in any document vocab = set() #total_wc is a Counter with words as keys and overall counts as values total_wc = Counter() #read the preprocessed text from file text_filenames = glob.glob(preprocessed_folder + "*.txt") one_percent = len(text_filenames) / 100.0 i = 0 for text_filename in text_filenames: article_id = utils.noext(text_filename) if valid_article_ids == None or article_id not in valid_article_ids: #count this one as already processed (since our caller didn't want results #from this file anyway) i += 1 continue if i % int(one_percent + 1) == 0: #add 1 to avoid div by 0 error for #one_percent < 1 percent_done = str(int(i / one_percent) + 1) print "Reading in preprocessed files: ", percent_done, "% complete\r", sys.stdout.flush() with open(text_filename, "r") as text_file: contents = text_file.read() no_special_chars = strip_specialchars(contents).lower() words = no_special_chars.split() #initialize the Counter for this article_id article_wcs[article_id] = Counter() for word in words: vocab.add(word) total_wc[word] += 1 article_wcs[article_id][word] += 1 i += 1 vocab = list(vocab) print return (article_wcs, vocab, total_wc)
def preprocess_img(dpi, pdf_folder, img_folder):
    """Queue every PDF in pdf_folder for rendering into img_folder at the given dpi."""
    jobs = [
        [img_folder + utils.noext(path), path, dpi]
        for path in glob.glob(pdf_folder + "*.pdf")
    ]
    do_preprocessing(jobs)
def calculate_img_features(specs, img_folder):
    """Compute image features for every TIFF in img_folder, keyed by file id."""
    tif_paths = glob.glob(img_folder + "*.tif")
    feature_results = do_feature_calculation(specs, tif_paths)
    return {
        utils.noext(path): feats
        for path, feats in zip(tif_paths, feature_results)
    }
#feature names in feature_names_lookup are prepended with the filename they came from #our features didn't come from a file but we know what the file names would be if they did: metadata_features_filenames = {fieldname: "features/metadata/" + metadata_feature_specs + \ "/feature_" + safify_field(fieldname) + ".csv" for fieldname in metadata_fields} word_features_filename = "features/words/feature_words" + str(word_feature_specs) + ".csv" img_features_filename = "features/img/feature_img_" + img_feature_specs + ".csv" i_s = [] j_s = [] data = [] #populate the feature matrix X #default feature values to 0 if no features were calculated for i in range(len(pdf_paths)): pdf_path = pdf_paths[i] pdf_id = utils.noext(pdf_path) #populate the feature matrix with metadata features if pdf_id in metadata_feats: for field in metadata_fields: feature_filename = metadata_features_filenames[field] if field in metadata_feats[pdf_id]: #the field does appear in the metadata for this PDF #so make features for each word in the field for word in metadata_feats[pdf_id][field]: feature_name = feature_filename + "_" + word j = feature_names_lookup[feature_name] feature_value = metadata_feats[pdf_id][field][word] if feature_value != 0: i_s.append(i) j_s.append(j) data.append(feature_value)
# NOTE(review): fragment — the first lines below are the tail of a
# word-feature function whose definition starts before this chunk, and the
# __main__ block continues past the end of it; indentation reconstructed from
# a collapsed line (the per-article statements presumably sit inside a loop
# over article ids — confirm against the full file).
        word_feats[article_id] = Counter()
        #keep only this article's words that appear in the feature word set
        for word in article_wcs[article_id]:
            if word in words:
                word_feats[article_id][word] = article_wcs[article_id][word]
    return word_feats

if __name__ == "__main__":
    #get a list of all article ids which we will calculate features for. We ignore
    #article ids which are not preprocessed or which have invalid versions
    all_article_ids = article_store.get_train_article_ids()
    article_ids_without_version = \
        set([article_id for article_id in all_article_ids
             if article_store.get_version(article_id) == article_store.INVALID_VERSION])
    preprocessed_article_ids = set([
        utils.noext(filename)
        for filename in glob.glob("preprocessed/first_pages/*.txt")
    ])
    article_ids = list((all_article_ids & preprocessed_article_ids)
                       - article_ids_without_version)
    #map article id -> row index in the (eventual) feature matrix
    article_id_lookup = {article_ids[i]: i for i in range(len(article_ids))}
    #read in the text files from preprocessed/first_pages/
    #supply the second argument to ignore test article ids which may have been preprocessed
    article_wcs, vocab, total_wc = \
        read_preprocessed_textfiles("preprocessed/first_pages/", set(article_ids))
    print "calculating features..."
    #only include words in the reduced_vocab if they occur a total of at least N
    #times in any document
    word_count_cutoff = 10