Example #1
def do_OCR(df, path_txt_files, min_size, list_files=None):
    """
    Requires ImageMagick and tesseract-ocr.

    :param df: the complete dataframe; used to look up each document's
        expected language, which tesseract needs as a hint
    :param path_txt_files: folder containing the extracted .txt files
    :param min_size: files smaller than this size (in bytes) get re-OCRed
    :param list_files: optional explicit list of .txt files to process
    """
    import os
    from subprocess import call

    lang_map = {"en": "eng", "fr": "fra"}

    text_files = list_files if list_files else get_texts(path_txt_files, "txt")

    for f in text_files:
        # OCR files that are suspiciously small, or everything when an
        # explicit file list was given
        if os.path.getsize(f) < min_size or list_files:
            doc_id = int(os.path.basename(f)[:-4])
            lang = df.loc[df["id"] == doc_id, "lang_title"].iloc[0]
            pdf = f.replace(".txt", ".pdf")
            # Rasterize the first PDF page to a flattened 8-bit TIFF
            call("convert -density 300 {0}[0] -depth 8 -background white "
                 "-alpha remove -flatten +matte {0}.tiff".format(pdf), shell=True)
            # Run tesseract with the language hint; output replaces the old .txt
            call("tesseract -l {0} {1}.tiff {2}".format(lang_map[lang], pdf, f[:-4]), shell=True)
            call("rm {0}.tiff".format(pdf), shell=True)
Example #3
def pdf2txt(pdf_folder):
    import logging
    from subprocess import call

    pdf_files = get_texts(pdf_folder, "pdf")

    for f in pdf_files:
        # call() returns the process exit code instead of raising on failure,
        # so check it explicitly rather than wrapping it in try/except
        ret = call(
            "java -jar ../input/pdfs/pdfbox-app-1.8.5.jar ExtractText {0}".format(f),
            shell=True)
        if ret == 0:
            logging.info("Extracted text from {}".format(f))
        else:
            logging.warning("Could not extract text from {}".format(f))
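A minimal usage sketch, assuming the PDFs live in the ../input/pdfs/ folder implied by the jar path:

pdf2txt("../input/pdfs/")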
Example #5
def detect_garbage_text(text_path, min_chars=2):
    import logging
    import re

    garbage_files = []
    txt_files = get_texts(text_path, "txt")
    # Load the stopword list (one word per line)
    with open("ngrams/stopwords.txt", encoding="utf-8") as sw:
        stop_words = set(l.strip("\n") for l in sw if l.strip())
    for t in txt_files:
        with open(t, "r") as f:
            content = f.read().replace("\n", "")
            # Words with at least min_chars characters
            words = re.findall(r"\w{{{min},}}".format(min=min_chars), content)
            all_words = re.findall(r"\w+", content)
            # Too few real words, or almost no stopwords: likely OCR garbage
            if len(words) < 10 or len(stop_words.intersection(all_words)) < 5:
                logging.info("Garbage text!! File {}".format(t))
                logging.debug(content)
                garbage_files.append(t)
    return garbage_files
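A minimal usage sketch: flag likely-garbage extractions, then (hypothetically) hand them to do_OCR above for a re-run:

garbage = detect_garbage_text("../input/pdfs/", min_chars=2)
# do_OCR(df, "../input/pdfs/", min_size=0, list_files=garbage)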
Example #7
def make_input_file(folder, test_ids=[]):

    # Create folder if necessary
    if not os.path.exists(folder):
        os.mkdir(folder)

    # Sample some publications and build the set of their ids
    pubs = PubTexts(n=100000)
    pub_ids = set(pubs.ids())

    # Ignore these ids in the training files so they can be used for testing
    test_ids_set = set(test_ids)

    # Make the citation file. cited_ids_map maps each cited publication's
    # string id to an incremental integer index.
    cited_ids_map = {}
    citations = defaultdict(list)
    rows = db.select(fields=["citing", "cited"], table="graph")
    for citing, cited in rows:

        # Convert to ascii string
        citing, cited = str(citing), str(cited)

        # Only include it if it's in the sample set and not in the test set
        if (citing in pub_ids) and (cited in pub_ids) and \
           (citing not in test_ids_set) and (cited not in test_ids_set):

            if cited not in cited_ids_map:
                cited_ids_map[cited] = len(cited_ids_map)

            citations[citing].append(cited_ids_map[cited])

    cits_per_pub = np.mean([len(c) for c in citations.values()])
    print "%d citing pubs with %.2f citations per pub." % (len(citations),
                                                           cits_per_pub)
    print "%d cited pubs." % len(cited_ids_map)

    citing = list(citations.keys())

    vec = CountVectorizer(stop_words='english',
                          ngram_range=(1, 2),
                          max_df=0.5,
                          min_df=5,
                          max_features=20000)
    texts = vec.fit_transform(pubs.texts(citing, use_title=True, use_abs=True))

    # Write training file. Every row in the citing_tr.txt is the vector
    # representation of the text in the file.
    citations_file = open(os.path.join(folder, "citations_tr.txt"), "w")
    citing_file = open(os.path.join(folder, "citing_tr.txt"), "w")
    citing_ids_file = open(os.path.join(folder, "citing_tr_ids.txt"), "w")

    for i, pid in enumerate(citing):

        # Write citing ids so results can be mapped back after the search
        print(pid, file=citing_ids_file)

        # Write this publication's citations
        cited = sorted(citations[pid])
        print(len(cited), ' '.join(map(str, cited)), file=citations_file)

        print_row(citing_file, texts[i])

    citations_file.close()
    citing_file.close()
    citing_ids_file.close()

    # Release some memory
    del citations, citing, texts

    # Find vector representation for texts in the test set. This one we can't
    # fit, only transform, because it's test data (therefore unseen).
    texts = vec.transform(get_texts(test_ids, use_title=True, use_abs=True))

    # Write the test files now. The citing contains term frequencies and the citations
    # is basically zeros, since it's actually what we are trying to predict.
    citations_file = open(os.path.join(folder, "citations_ts.txt"), "w")
    citing_file = open(os.path.join(folder, "citing_ts.txt"), "w")
    citing_ids_file = open(os.path.join(folder, "citing_ts_ids.txt"), "w")

    for i in range(len(test_ids)):

        print(test_ids[i], file=citing_ids_file)
        print('0', file=citations_file)
        print_row(citing_file, texts[i])

    citations_file.close()
    citing_file.close()
    citing_ids_file.close()

    # Sort numeric ids so that each line x corresponds to pub x
    str_ids, num_ids = zip(*cited_ids_map.items())
    cited = np.asarray(str_ids)[list(num_ids)]
    texts = vec.fit_transform(pubs.texts(cited, use_title=True, use_abs=True))

    cited_file = open(os.path.join(folder, "cited.txt"), "w")
    cited_ids_file = open(os.path.join(folder, "cited_ids.txt"), "w")

    # Write term frequencies
    for i in range(len(cited)):

        # Write cited ids so the searches can be mapped back
        print(cited[i], file=cited_ids_file)

        print_row(cited_file, texts[i])

    cited_file.close()
    cited_ids_file.close()

    print "Done!"
Example #8
def main(feature_type: str, language: str, domain: str, main_dir: str, seq_len: int,
         batch_size: int, lstm_dim: int, character_level: bool = False):
    """
    Parameters
    ----------
    feature_type: the name of the feature
    language: language of the text
    domain: domain of the texts (appended to output file names; may be empty)
    main_dir: base directory
    seq_len: sequence length
    batch_size: batch size
    lstm_dim: lstm hidden dimension
    character_level: whether tokenizer should be on character level.
    """

    texts = get_texts(main_dir, language, feature_type, character_level, domain)

    tokenizer = Tokenizer(texts.values(), character_level=character_level)

    samples = {}

    for book in texts:
        logger.debug(f"Text length for {book}: {len(texts[book])}")
        len_text = len(texts[book]) if character_level else len(texts[book].split())

        if len_text < seq_len:
            logger.warn(f"Requested seq_len larger than text length: {len_text} / {seq_len} "
                             f"for {book} and feature type {feature_type}.")
            continue
        rand_idx = np.random.randint(0, len_text - seq_len, batch_size)

        if character_level:
            samples[book] = tokenizer.encode([texts[book][i: i + seq_len] for i in rand_idx])

        else:
            split_text = texts[book].split()
            samples[book] = tokenizer.encode(
                [" ".join(split_text[i: i + seq_len]) for i in rand_idx]
            )

    test_generator = DataGenerator(tokenizer,
                                   tokenizer.full_text,
                                   seq_len=seq_len,
                                   batch_size=batch_size,
                                   with_embedding=True,
                                   train=False)

    sample_batch = next(iter(test_generator))

    logger.info(f"X batch shape: {sample_batch[0].shape}, y batch shape: {sample_batch[1].shape}")
    logger.info(f"Sample batch text: {tokenizer.decode(sample_batch[0][0])}")

    file_path = os.path.join(main_dir, 'models',
                             f'{feature_type}_{language}_lstm_{lstm_dim}')

    if domain:
        file_path += '_' + domain

    if character_level:
        file_path += '_character_level'

    file_path += '.h5'

    logger.info(f"Loading {file_path}")

    prediction_model = lstm_model(num_words=tokenizer.num_words,
                                  lstm_dim=lstm_dim,
                                  seq_len=1,
                                  batch_size=batch_size,
                                  stateful=True,
                                  return_state=True)

    prediction_model.load_weights(file_path)

    hiddens = {}
    seeds = {}
    predictions = {}

    for book in samples:
        seed = np.stack(samples[book])
        logger.debug(f"Seed shape: {seed.shape}")
        hf, preds = generate_text(prediction_model, tokenizer, seed, get_hidden=True)
        logger.debug(f"Hidden state shape: {hf.shape}")
        hiddens[book] = hf
        seeds[book] = seed
        preds = [tokenizer.ix_to_word[pred] for pred in preds]
        predictions[book] = preds

    file_name = f'{feature_type}_{language}_lstm_{lstm_dim}_seq_len_{seq_len}'

    if domain:
        file_name += '_' + domain

    if character_level:
        file_name += '_character-level'
    file_name += '.pkl'

    path_out = os.path.join('data', 'hidden_states', file_name)
    with open(path_out, 'wb') as f:
        pickle.dump(hiddens, f)

    logger.info(f"Succesfully saved hidden dimensions to {path_out}")

    path_out = os.path.join('data', 'seeds', file_name)
    with open(path_out, 'wb') as f:
        pickle.dump(seeds, f)
    logger.info(f"Succesfully saved seeds to {path_out}")

    path_out = os.path.join('data', 'predictions', file_name)
    with open(path_out, 'wb') as f:
        pickle.dump(predictions, f)

    logger.info(f"Succesfully saved predictions to {path_out}")
Example #9
def main(feature_type: str,
         language: str,
         domain: str,
         main_dir: str,
         seq_len: int,
         batch_size: int,
         test_batch_size: int,
         lstm_dim: int,
         character_level: bool = False):
    """
    Parameters
    ----------
    feature_type: the name of the feature
    language: language of the corpus
    domain: domain of the texts (appended to the model file name; may be empty)
    main_dir: base directory
    seq_len: sequence length
    batch_size: batch size
    test_batch_size: test batch size
    lstm_dim: lstm hidden dimension
    character_level: whether tokenizer should be on character level.
    """

    texts = get_texts(main_dir, language, feature_type, character_level,
                      domain)

    tokenizer = Tokenizer(texts.values(), character_level=character_level)

    train_generator = DataGenerator(tokenizer,
                                    tokenizer.full_text,
                                    seq_len=seq_len,
                                    batch_size=batch_size,
                                    with_embedding=True,
                                    train=True)

    test_generator = DataGenerator(tokenizer,
                                   tokenizer.full_text,
                                   seq_len=seq_len,
                                   batch_size=test_batch_size,
                                   with_embedding=True,
                                   train=False)

    sample_batch = next(iter(train_generator))

    logger.info(
        f"X batch shape: {sample_batch[0].shape}, y batch shape: {sample_batch[1].shape}"
    )
    logger.info(f"Sample batch text: {tokenizer.decode(sample_batch[0][0])}")

    training_model = lstm_model(num_words=tokenizer.num_words,
                                seq_len=seq_len,
                                lstm_dim=lstm_dim,
                                stateful=False)

    file_path = os.path.join(main_dir, 'models',
                             f'{feature_type}_{language}_lstm_{lstm_dim}')

    if domain:
        file_path += '_' + domain

    if character_level:
        file_path += '_character_level'

    file_path += '.h5'

    # Save the initial weights; the checkpoint callback below overwrites this
    # file with the best model seen during training
    training_model.save_weights(file_path)

    checkpoint = tf.keras.callbacks.ModelCheckpoint(file_path,
                                                    monitor='val_loss',
                                                    save_best_only=True)
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                      patience=2)

    generate_text = GenerateText(test_generator, tokenizer, file_path,
                                 lstm_dim)
    callbacks_list = [checkpoint, early_stopping, generate_text]

    # Model.fit accepts generators directly; fit_generator is deprecated in TF 2
    training_model.fit(train_generator,
                       validation_data=test_generator,
                       callbacks=callbacks_list,
                       epochs=256)
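A minimal usage sketch for this training entry point (all argument values are illustrative assumptions):

main(feature_type="tokens", language="en", domain="", main_dir="data",
     seq_len=100, batch_size=32, test_batch_size=32, lstm_dim=128,
     character_level=False)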