def calculate_metadata_features(specs, pdf_folder):
    spec_filename = "features/metadata/specs_" + specs + ".csv"
    #num_top_words = specs.split("_")[0][:-1]
    #field_min_percent_occurrence = specs.split("_")[1][:-1]

    #fields is the set of metadata fields named in the spec file; words maps each
    #field to the set of feature words tracked for that field
    fields = set()
    words = {}
    with open(spec_filename, "r") as spec_file:
        spec_reader = csv.DictReader(spec_file)
        for line in spec_reader:
            field = line["field"].strip()
            word = line["word"].strip()
            fields.add(field)
            if field not in words:
                words[field] = set()
            words[field].add(word)

    paths = glob.glob(pdf_folder + "*.pdf")
    results = calculate_metadatas(paths)

    metadatas = {}
    for i in range(len(results)):
        path_id = utils.noext(paths[i])
        metadatas[path_id] = {}
        for field in results[i]:
            safe_field = safify_field(field)
            if safe_field in fields:
                metadatas[path_id][safe_field] = Counter()
                for word in results[i][field].split(" "):
                    if word in words[safe_field]:
                        metadatas[path_id][safe_field][word] += 1
    return (fields, words, metadatas)
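#for reference, calculate_metadata_features expects the spec CSV to have "field"
#and "word" columns (per the DictReader lookups above); a hypothetical spec file
#might look like:
#
#   field,word
#   author,smith
#   title,analysis
#   title,review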
def preprocess_text(pdf_folder, output_folder):
    pdf_paths = glob.glob(pdf_folder + "*.pdf")
    pdf_infos = [{
        "plaintext_path": output_folder + utils.noext(pdf_path) + ".txt",
        "pdf_path": pdf_path
    } for pdf_path in pdf_paths]
    do_preprocessing(pdf_infos)
def read_preprocessed_textfiles(preprocessed_folder, valid_article_ids=None):
    #article_wcs is of the form {article_id: wcs} where wcs is a Counter with words
    #as keys and counts as values
    article_wcs = {}
    #vocab contains all words which occur in any document
    vocab = set()
    #total_wc is a Counter with words as keys and overall counts as values
    total_wc = Counter()

    #read the preprocessed text from file
    text_filenames = glob.glob(preprocessed_folder + "*.txt")

    one_percent = len(text_filenames) / 100.0
    i = 0
    for text_filename in text_filenames:
        article_id = utils.noext(text_filename)
        if valid_article_ids is not None and article_id not in valid_article_ids:
            #count this one as already processed (since our caller didn't want results
            #from this file anyway)
            i += 1
            continue
        #add 1 inside int() to avoid a modulo-by-zero error when there are fewer
        #than 100 files (i.e. one_percent < 1)
        if i % int(one_percent + 1) == 0:
            percent_done = str(int(i / one_percent) + 1)
            print "Reading in preprocessed files: ", percent_done, "% complete\r",
            sys.stdout.flush()
        with open(text_filename, "r") as text_file:
            contents = text_file.read()
            no_special_chars = strip_specialchars(contents).lower()
            words = no_special_chars.split()
            #initialize the Counter for this article_id
            article_wcs[article_id] = Counter()
            for word in words:
                vocab.add(word)
                total_wc[word] += 1
                article_wcs[article_id][word] += 1
        i += 1

    vocab = list(vocab)
    print

    return (article_wcs, vocab, total_wc)
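#to make the return values concrete: for two hypothetical one-line articles
#containing "a b b" and "b c", the structures above would be
#
#   article_wcs == {"art1": Counter({"b": 2, "a": 1}),
#                   "art2": Counter({"b": 1, "c": 1})}
#   vocab == ["a", "b", "c"]  (order arbitrary, since it comes from a set)
#   total_wc == Counter({"b": 3, "a": 1, "c": 1})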
def preprocess_img(dpi, pdf_folder, img_folder):
    pdf_paths = glob.glob(pdf_folder + "*.pdf")
    pdf_infos = [[img_folder + utils.noext(pdf_path), pdf_path, dpi]
                 for pdf_path in pdf_paths]
    do_preprocessing(pdf_infos)
def calculate_img_features(specs, img_folder):
    paths = glob.glob(img_folder + "*.tif")
    results = do_feature_calculation(specs, paths)
    img_feats = {utils.noext(paths[i]): results[i] for i in range(len(paths))}
    return img_feats
#feature names in feature_names_lookup are prepended with the filename they came from
#our features didn't come from a file but we know what the file names would be if they did:
metadata_features_filenames = {fieldname: "features/metadata/" + metadata_feature_specs + \
                                          "/feature_" + safify_field(fieldname) + ".csv"
                               for fieldname in metadata_fields}
word_features_filename = "features/words/feature_words" + str(word_feature_specs) + ".csv"
img_features_filename = "features/img/feature_img_" + img_feature_specs + ".csv"

i_s = []
j_s = []
data = []
#populate the feature matrix X
#default feature values to 0 if no features were calculated
for i in range(len(pdf_paths)):
    pdf_path = pdf_paths[i]
    pdf_id = utils.noext(pdf_path)
    #populate the feature matrix with metadata features
    if pdf_id in metadata_feats:
        for field in metadata_fields:
            feature_filename = metadata_features_filenames[field]
            if field in metadata_feats[pdf_id]:
                #the field does appear in the metadata for this PDF
                #so make features for each word in the field
                for word in metadata_feats[pdf_id][field]:
                    feature_name = feature_filename + "_" + word
                    j = feature_names_lookup[feature_name]
                    feature_value = metadata_feats[pdf_id][field][word]
                    if feature_value != 0:
                        i_s.append(i)
                        j_s.append(j)
                        data.append(feature_value)
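#the triplet lists built above are standard COO ingredients; a sketch of how they
#could become the sparse feature matrix X once all loops finish (assuming scipy
#is available; the actual construction is cut off in this excerpt):
#
#   from scipy.sparse import coo_matrix
#   X = coo_matrix((data, (i_s, j_s)),
#                  shape=(len(pdf_paths), len(feature_names_lookup)))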
#the header of this function was truncated in the source; the name and signature
#below are inferred from the body and from the other calculate_* functions
def calculate_word_features(article_wcs, words):
    word_feats = {}
    for article_id in article_wcs:
        word_feats[article_id] = Counter()
        for word in article_wcs[article_id]:
            if word in words:
                word_feats[article_id][word] = article_wcs[article_id][word]
    return word_feats


if __name__ == "__main__":
    #get a list of all article ids which we will calculate features for. We ignore
    #article ids which are not preprocessed or which have invalid versions
    all_article_ids = article_store.get_train_article_ids()
    article_ids_without_version = \
            set([article_id for article_id in all_article_ids
                 if article_store.get_version(article_id) == article_store.INVALID_VERSION])
    preprocessed_article_ids = set([
        utils.noext(filename)
        for filename in glob.glob("preprocessed/first_pages/*.txt")
    ])
    article_ids = list((all_article_ids & preprocessed_article_ids) -
                       article_ids_without_version)
    article_id_lookup = {article_ids[i]: i for i in range(len(article_ids))}

    #read in the text files from preprocessed/first_pages/
    #supply the second argument to ignore test article ids which may have been preprocessed
    article_wcs, vocab, total_wc = \
            read_preprocessed_textfiles("preprocessed/first_pages/", set(article_ids))
    print "calculating features..."

    #only include words in the reduced_vocab if they occur at least
    #word_count_cutoff times in total across all documents
    word_count_cutoff = 10
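    #a sketch of the step this cutoff sets up (the reduced_vocab code itself is
    #cut off in this excerpt): keep only words whose total count clears the cutoff
    #
    #   reduced_vocab = [word for word in vocab
    #                    if total_wc[word] >= word_count_cutoff]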