def do_OCR(df, path_txt_files, min_size, list_files=None):
    """
    NEEDS imagemagick and tesseract-ocr.

    :param df: the complete df, needed to look up each document's language,
        because tesseract wants the expected language of the image
    :param path_txt_files: folder containing the extracted .txt files
    :param min_size: files smaller than this (in bytes) are re-OCRed
    :param list_files: optional explicit list of files to OCR regardless of size
    :return:
    """
    import os
    from subprocess import call

    lang_map = {"en": "eng", "fr": "fra"}
    if list_files:
        text_files = list_files
    else:
        text_files = get_texts(path_txt_files, "txt")
    for f in text_files:
        # Re-OCR files that are suspiciously small, or everything in list_files.
        if os.path.getsize(f) < min_size or list_files:
            lang = df.loc[df["id"] == int(os.path.basename(f)[:-4])]["lang_title"].tolist()[0]
            # Rasterize the first PDF page to a clean 8-bit TIFF for tesseract.
            call("convert -density 300 {0}[0] -depth 8 -background white -alpha remove "
                 "-flatten +matte {0}.tiff".format(f.replace(".txt", ".pdf")), shell=True)
            call("tesseract -l {0} {1}.tiff {2}".format(
                lang_map[lang], f.replace(".txt", ".pdf"), f[:-4]), shell=True)
            call("rm {0}.tiff".format(f.replace(".txt", ".pdf")), shell=True)
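# Sketch of the get_texts helper used by do_OCR above (and pdf2txt below),
# which is not defined in this file. Assumption: it simply globs a folder for
# files with the given extension. Note that the LSTM scripts further down use
# a different get_texts with its own signature; they come from another module.
def get_texts(folder, extension):
    import glob
    import os
    return glob.glob(os.path.join(folder, "*.{}".format(extension)))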
def pdf2txt(pdf_folder):
    from subprocess import call
    pdf_files = get_texts(pdf_folder, "pdf")
    for f in pdf_files:
        try:
            # PDFBox writes the extracted text next to the PDF as <name>.txt.
            call("java -jar ../input/pdfs/pdfbox-app-1.8.5.jar ExtractText {0}".format(f),
                 shell=True)
            logging.info("Extracted text from {}".format(f))
        except Exception:
            logging.warning("Could not extract text from {}".format(f))
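# Design note (a sketch, not from the original source; pdf2txt_safe is a
# hypothetical name): passing an argument list instead of a shell string
# avoids quoting problems with file names that contain spaces, and sidesteps
# shell injection entirely.
def pdf2txt_safe(pdf_folder):
    from subprocess import call
    for f in get_texts(pdf_folder, "pdf"):
        call(["java", "-jar", "../input/pdfs/pdfbox-app-1.8.5.jar", "ExtractText", f])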
def detect_garbage_text(text_path, min_chars=2):
    import re
    garbage_files = []
    txt_files = get_texts(text_path, "txt")
    # Python 2: decode the bytes read from the stopwords file.
    stop_words = set([l.strip("\n").decode("utf-8")
                      for l in open("ngrams/stopwords.txt").readlines() if l])
    for t in txt_files:
        with open(t, "r") as f:
            content = f.read().replace("\n", "")
        # Count words with at least min_chars characters; too few of them, or
        # too few stopwords, suggests the extracted text is OCR garbage.
        search_re = r"\w{{{min},}}".format(min=str(min_chars))
        words = re.findall(search_re, content)
        all_words = re.findall(r"\w+", content)
        if len(words) < 10 or len(stop_words.intersection(all_words)) < 5:
            logging.info("Garbage text!! File {}".format(t))
            logging.debug(content)
            garbage_files.append(t)
    return garbage_files
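# End-to-end usage sketch (the function name, folder layout, and size threshold
# are assumptions, not from the original source): extract text from the PDFs,
# flag files that still look like OCR garbage, and force a re-OCR of just those
# files via the list_files hook. With list_files set, do_OCR processes every
# listed file regardless of min_size.
def reprocess_folder(df, folder):
    pdf2txt(folder)
    do_OCR(df, folder, min_size=1024)
    garbage = detect_garbage_text(folder)
    if garbage:
        do_OCR(df, folder, min_size=0, list_files=garbage)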
def make_input_file(folder, test_ids=[]):
    # Create folder if necessary
    if not os.path.exists(folder):
        os.mkdir(folder)

    # Sample some publications and create the structure
    pubs = PubTexts(n=100000)
    pub_ids = set(pubs.ids())

    # Ignore the test ids in the training files so they can be used for testing
    test_ids_set = set(test_ids)

    # Make citation file. cited_ids_map maps string ids to incremental integers.
    cited_ids_map = {}
    citations = defaultdict(list)
    rows = db.select(fields=["citing", "cited"], table="graph")
    for citing, cited in rows:
        # Convert to ascii string
        citing, cited = str(citing), str(cited)
        # Only include it if it's in the sample set and not in the test set
        if (citing in pub_ids) and (cited in pub_ids) and \
           (citing not in test_ids_set) and (cited not in test_ids_set):
            if cited not in cited_ids_map:
                cited_ids_map[cited] = len(cited_ids_map)
            citations[citing].append(cited_ids_map[cited])

    cits_per_pub = np.mean([len(c) for c in citations.values()])
    print "%d citing pubs with %.2f citations per pub." % (len(citations), cits_per_pub)
    print "%d cited pubs." % len(cited_ids_map)

    citing = citations.keys()
    vec = CountVectorizer(stop_words='english', ngram_range=(1, 2),
                          max_df=0.5, min_df=5, max_features=20000)
    texts = vec.fit_transform(pubs.texts(citing, use_title=True, use_abs=True))

    # Write the training files. Every row in citing_tr.txt is the vector
    # representation of the text in the file.
    citations_file = open(os.path.join(folder, "citations_tr.txt"), "w")
    citing_file = open(os.path.join(folder, "citing_tr.txt"), "w")
    citing_ids_file = open(os.path.join(folder, "citing_tr_ids.txt"), "w")
    for i, pid in enumerate(citing):
        # Writing citing ids to map back on the search
        print >> citing_ids_file, str(pid)
        # Writing citations
        cited = sorted(citations[pid])
        print >> citations_file, len(cited), ' '.join(map(str, cited))
        print_row(citing_file, texts[i])
    citations_file.close()
    citing_file.close()
    citing_ids_file.close()

    # Release some memory
    del citations, citing, texts

    # Find vector representation for texts in the test set. This one we can't
    # fit, only transform, because it's test data (therefore unseen).
    texts = vec.transform(pubs.texts(test_ids, use_title=True, use_abs=True))

    # Write the test files now. The citing file contains term frequencies and the
    # citations file is basically zeros, since that's what we are trying to predict.
    citations_file = open(os.path.join(folder, "citations_ts.txt"), "w")
    citing_file = open(os.path.join(folder, "citing_ts.txt"), "w")
    citing_ids_file = open(os.path.join(folder, "citing_ts_ids.txt"), "w")
    for i in xrange(len(test_ids)):
        print >> citing_ids_file, str(test_ids[i])
        print >> citations_file, '0'
        print_row(citing_file, texts[i])
    citations_file.close()
    citing_file.close()
    citing_ids_file.close()

    # Invert cited_ids_map so that line x of cited.txt corresponds to the pub
    # whose numeric id is x (argsort gives the inverse permutation).
    str_ids, num_ids = zip(*cited_ids_map.items())
    cited = np.asarray(str_ids)[np.argsort(num_ids)]
    texts = vec.fit_transform(pubs.texts(cited, use_title=True, use_abs=True))

    cited_file = open(os.path.join(folder, "cited.txt"), "w")
    cited_ids_file = open(os.path.join(folder, "cited_ids.txt"), "w")
    # Writing term frequencies
    for i in xrange(len(cited)):
        # Writing cited ids to map back the searches
        print >> cited_ids_file, str(cited[i])
        print_row(cited_file, texts[i])
    cited_file.close()
    cited_ids_file.close()
    print "Done!"
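# Sketch of the print_row helper used by make_input_file, which is not defined
# here. Assumption: each line is a libsvm-style sparse encoding of one
# CountVectorizer row, "<nnz> <term_index>:<count> ..."; the real format may differ.
def print_row(out_file, row):
    row = row.tocoo()
    pairs = ' '.join('%d:%d' % (j, v) for j, v in zip(row.col, row.data))
    out_file.write('%d %s\n' % (row.nnz, pairs))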
def main(feature_type: str, language: str, domain: str, main_dir: str,
         seq_len: int, batch_size: int, lstm_dim: int,
         character_level: bool = False):
    """
    Parameters
    ----------
    feature_type: the name of the feature
    language: language of the text
    domain: optional domain the texts belong to
    main_dir: base directory
    seq_len: sequence length
    batch_size: batch size
    lstm_dim: lstm hidden dimension
    character_level: whether the tokenizer should be on character level
    """
    texts = get_texts(main_dir, language, feature_type, character_level, domain)
    tokenizer = Tokenizer(texts.values(), character_level=character_level)

    # Draw batch_size random windows of seq_len tokens from each book.
    samples = {}
    for book in texts:
        len_text = len(texts[book]) if character_level else len(texts[book].split())
        if len_text < seq_len:
            logger.warning(f"Requested seq_len larger than text length: {len_text} / {seq_len} "
                           f"for {book} and feature type {feature_type}.")
            continue
        rand_idx = np.random.randint(0, len_text - seq_len, batch_size)
        if character_level:
            samples[book] = tokenizer.encode([texts[book][i: i + seq_len] for i in rand_idx])
        else:
            split_text = texts[book].split()
            samples[book] = tokenizer.encode(
                [" ".join(split_text[i: i + seq_len]) for i in rand_idx]
            )

    test_generator = DataGenerator(tokenizer, tokenizer.full_text, seq_len=seq_len,
                                   batch_size=batch_size, with_embedding=True, train=False)
    sample_batch = next(iter(test_generator))
    logger.info(f"X batch shape: {sample_batch[0].shape}, y batch shape: {sample_batch[1].shape}")
    logger.info(f"Sample batch text: {tokenizer.decode(sample_batch[0][0])}")

    file_path = os.path.join(main_dir, 'models', f'{feature_type}_{language}_lstm_{lstm_dim}')
    if domain:
        file_path += '_' + domain
    if character_level:
        file_path += '_character_level'
    file_path += '.h5'
    logger.info(f"Loading {file_path}")

    # Stateful model with seq_len 1, so tokens can be fed one at a time and the
    # hidden state read out after each step.
    prediction_model = lstm_model(num_words=tokenizer.num_words, lstm_dim=lstm_dim,
                                  seq_len=1, batch_size=batch_size,
                                  stateful=True, return_state=True)
    prediction_model.load_weights(file_path)

    hiddens = {}
    seeds = {}
    predictions = {}
    for book in samples:
        seed = np.stack(samples[book])
        hf, preds = generate_text(prediction_model, tokenizer, seed, get_hidden=True)
        logger.debug(f"Seed shape: {seed.shape}, hidden states shape: {hf.shape}")
        hiddens[book] = hf
        seeds[book] = seed
        predictions[book] = [tokenizer.ix_to_word[pred] for pred in preds]

    file_name = f'{feature_type}_{language}_lstm_{lstm_dim}_seq_len_{seq_len}'
    if domain:
        file_name += '_' + domain
    if character_level:
        file_name += '_character-level'
    file_name += '.pkl'

    path_out = os.path.join('data', 'hidden_states', file_name)
    with open(path_out, 'wb') as f:
        pickle.dump(hiddens, f)
    logger.info(f"Successfully saved hidden states to {path_out}")

    path_out = os.path.join('data', 'seeds', file_name)
    with open(path_out, 'wb') as f:
        pickle.dump(seeds, f)
    logger.info(f"Successfully saved seeds to {path_out}")

    path_out = os.path.join('data', 'predictions', file_name)
    with open(path_out, 'wb') as f:
        pickle.dump(predictions, f)
    logger.info(f"Successfully saved predictions to {path_out}")
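# Hypothetical entry point for the extraction script (the original CLI wiring
# is not shown; all argument values below are assumptions):
if __name__ == "__main__":
    main(feature_type="title", language="en", domain="", main_dir=".",
         seq_len=50, batch_size=32, lstm_dim=128)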
def main(feature_type: str, language: str, domain: str, main_dir: str,
         seq_len: int, batch_size: int, test_batch_size: int, lstm_dim: int,
         character_level: bool = False):
    """
    Parameters
    ----------
    feature_type: the name of the feature
    language: language of the corpus
    domain: optional domain the texts belong to
    main_dir: base directory
    seq_len: sequence length
    batch_size: batch size
    test_batch_size: test batch size
    lstm_dim: lstm hidden dimension
    character_level: whether the tokenizer should be on character level
    """
    texts = get_texts(main_dir, language, feature_type, character_level, domain)
    tokenizer = Tokenizer(texts.values(), character_level=character_level)

    train_generator = DataGenerator(tokenizer, tokenizer.full_text, seq_len=seq_len,
                                    batch_size=batch_size, with_embedding=True, train=True)
    test_generator = DataGenerator(tokenizer, tokenizer.full_text, seq_len=seq_len,
                                   batch_size=test_batch_size, with_embedding=True, train=False)

    sample_batch = next(iter(train_generator))
    logger.info(f"X batch shape: {sample_batch[0].shape}, y batch shape: {sample_batch[1].shape}")
    logger.info(f"Sample batch text: {tokenizer.decode(sample_batch[0][0])}")

    training_model = lstm_model(num_words=tokenizer.num_words, seq_len=seq_len,
                                lstm_dim=lstm_dim, stateful=False)

    file_path = os.path.join(main_dir, 'models', f'{feature_type}_{language}_lstm_{lstm_dim}')
    if domain:
        file_path += '_' + domain
    if character_level:
        file_path += '_character_level'
    file_path += '.h5'

    # Save the initial weights; ModelCheckpoint overwrites this file with the
    # best weights (by validation loss) during training.
    training_model.save_weights(file_path)

    checkpoint = tf.keras.callbacks.ModelCheckpoint(file_path, monitor='val_loss',
                                                    save_best_only=True)
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
    generate_text = GenerateText(test_generator, tokenizer, file_path, lstm_dim)
    callbacks_list = [checkpoint, early_stopping, generate_text]

    training_model.fit_generator(train_generator, validation_data=test_generator,
                                 callbacks=callbacks_list, epochs=256)
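# Hypothetical entry point for the training script (argument values are
# assumptions, mirroring the extraction example above):
if __name__ == "__main__":
    main(feature_type="title", language="en", domain="", main_dir=".",
         seq_len=50, batch_size=32, test_batch_size=32, lstm_dim=128)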