def __lowercase_document(document_name: pathlib.Path) -> list:
    lines = u.read_document(document_name)
    result = []
    for line in lines:
        line = line.lower()
        result.append(line)
    return result

def __remove_punction_from_document(document_name: pathlib.Path) -> list:
    lines = u.read_document(document_name)
    result = []
    for line in lines:
        chars = [c for c in line if c not in string.punctuation]
        line = ''.join(chars)
        result.append(line)
    return result

def __remove_numbers_from_document(document_name: pathlib.Path) -> list:
    lines = u.read_document(document_name)
    result = []
    for line in lines:
        chars = [c for c in line if not c.isnumeric()]
        line = ''.join(chars)
        result.append(line)
    return result

def __remove_stopwords_from_document(document_name: pathlib.Path, stopwords: set) -> list:
    lines = u.read_document(document_name)
    result = []
    for line in lines:
        words = line.split(' ')
        words = [word for word in words if word not in stopwords]
        sentence = ' '.join(words)
        result.append(sentence)
    return result

def __vectorise_document(document_name: pathlib.Path, token_map: dict) -> list:
    lines = u.read_document(document_name)
    vectors = []
    for line in lines:
        tokens = line.split()
        tokens = [token_map[token] for token in tokens]
        tokens = [str(token) for token in tokens]
        vector = ' '.join(tokens)
        vectors.append(vector)
    return vectors

def __tokenize_document(document_name: pathlib.Path) -> list:
    lines = u.read_document(document_name)
    result = []
    for line in lines:
        sentences = sent_tokenize.tokenize(line)
        for i in range(0, len(sentences)):
            sentence = sentences[i]
            words = word_tokenize.tokenize(sentence)
            sentences[i] = ' '.join(words)
        result.extend(sentences)
    return result

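# Note: sent_tokenize and word_tokenize are used above as tokenizer objects with
# a .tokenize() method rather than as NLTK's convenience functions, so the module
# presumably instantiates them elsewhere. A minimal sketch of definitions that
# would satisfy these calls; the names and tokenizer choices are assumptions,
# not taken from the original module.
import nltk

sent_tokenize = nltk.data.load('tokenizers/punkt/english.pickle')  # PunktSentenceTokenizer
word_tokenize = nltk.tokenize.TreebankWordTokenizer()
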
def __normalize_document_by_truncation(document_name: pathlib.Path, max_length: int) -> list:
    lines = u.read_document(document_name)
    results = []
    for line in lines:
        tokens = line.split()
        if len(tokens) > max_length:
            tokens = tokens[0:max_length]
        max_length = max_length - len(tokens)
        if len(tokens) > 0:
            results.append(' '.join(tokens))
    return results

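# Note that max_length acts as a running budget across the whole document rather
# than a per-line cap: each line's token count is subtracted from it, so once the
# budget is spent the remaining lines contribute nothing. A small in-memory
# illustration of that behaviour; the input strings are invented for the example.
lines = ['alpha beta gamma', 'delta epsilon', 'zeta']
max_length, results = 4, []
for line in lines:
    tokens = line.split()
    if len(tokens) > max_length:
        tokens = tokens[:max_length]
    max_length -= len(tokens)
    if tokens:
        results.append(' '.join(tokens))
# results == ['alpha beta gamma', 'delta']
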
def __document_to_sentences(document_name: pathlib.Path) -> list:
    lines = u.read_document(document_name)
    sentences = lines
    sentences = [sentence.strip() for sentence in sentences]
    sentences = [sentence for sentence in sentences if len(sentence) > 0]
    widgets = [
        'Calculating grade level: ',
        pb.Percentage(), ' ',
        pb.Bar(marker='.', left='[', right=']'), ' ',
        pb.ETA()]
    with pb.ProgressBar(widgets=widgets, max_value=len(sentences)) as bar:
        for i in range(0, len(sentences)):
            bar.update(i)
            sentence = sentences[i]
            grade_level = __calculate_sentences_median_grade_level(sentence)
            sentences[i] = sentenceplus(sentence, grade_level)
    return sentences

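# __calculate_sentences_median_grade_level, the pb progress-bar alias
# (progressbar2-style widgets) and the sentenceplus container are all defined
# elsewhere in the module. A minimal sketch of a two-field container that would
# satisfy the sentenceplus(sentence, grade_level) call above; the field names
# are assumptions and the real definition may differ.
import collections

sentenceplus = collections.namedtuple('sentenceplus', ['text', 'grade_level'])
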
def load_temario(self, version):
    '''
    TEMARIO: number of words or sentences of the final summary
    CSTNews: 70% of the word count of the highest-weight document
    '''
    print("temario :)")
    corpus_dictionary = dict()
    if version == 'temario_v1':
        path = corpus_dir[version]
        path_sumarios = summaries_dir[version]
        documents = os.listdir(path)
        sumarios = os.listdir(path_sumarios)
        for i in documents:
            docPath = path + '/' + i
            # print(docPath)
            document_name = i[3:]
            document_name = document_name[:-4]
            document_sentences = read_document(docPath, self.language)
            class_labels_ml = None
            if self.dictionary_class_labels is not None:
                class_labels_ml = self.dictionary_class_labels[document_name]
            # modified to also tag whether each sentence belongs to the summary or not
            naive_tagged_sentences = naive_tag(document_sentences, class_labels_ml)
            #print(naive_tagged_sentences)
            #corpus_dictionary[document_name] = [document_sentences]
            corpus_dictionary[document_name] = [naive_tagged_sentences]
        for i in sumarios:
            summPath = path_sumarios + i
            # print(summPath)
            summary_name = i[4:]
            summary_name = summary_name[:-4]
            size_summary = count_words(summPath, self.language)
            value = corpus_dictionary[summary_name]
            # size_summary
            value.append(size_summary)
            corpus_dictionary[summary_name] = value
    else:
        print('version 2')
    return corpus_dictionary

def load_cst_news(self, version):
    print("cst news :)")
    corpus_dictionary = dict()
    if version == 'cstnews_v1':
        path = corpus_dir[version]
        clusters = os.listdir(path)
        special = '.DS_Store'
        if special in clusters:
            clusters.remove(special)
        for i in clusters:
            sub_path = path + i + '/' + corpus_dir['textosFonte']
            documents = os.listdir(sub_path)
            if special in documents:
                documents.remove(special)
            allSentences = []
            document_lengths = []
            #top_sentences = []
            index = 1
            for j in documents:
                document = sub_path + j
                document_sentences = read_document(document, self.language)
                class_labels_ml = None
                if self.dictionary_class_labels is not None:
                    class_labels_ml = self.dictionary_class_labels[i]
                #for k in range(3):
                #    top_sentences.append(document_sentences[k])
                document_size = count_words(document, self.language)
                document_lengths.append(document_size)
                taggedSentences = tag_sentence(document_sentences, index, class_labels_ml)
                #print(taggedSentences)
                index += 1
                allSentences.extend(taggedSentences)
            size_cluster = max(document_lengths)
            size_summary = (30 * size_cluster) / 100
            #corpus_dictionary[i] = [allSentences, size_summary, top_sentences]
            corpus_dictionary[i] = [allSentences, size_summary]
    else:
        print('version 2')
    # corpus = ['dictionary with names and the data', 'loaded, unprocessed corpus', 'vector of summary sizes']
    return corpus_dictionary

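# For each CSTNews cluster the extract budget is 30% of the word count of its
# longest source document. A small worked example with invented word counts:
document_lengths = [412, 530, 477]          # invented per-document word counts
size_cluster = max(document_lengths)        # 530
size_summary = (30 * size_cluster) // 100   # 159 words for the cluster extract
                                            # (// mirrors the integer division of the Python 2 code above)
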
print("Skipping mapping: " + line) except IOError as e: print("The mappings could not be read ({0}): {1}".format(e.errno, e.strerror)) sys.exit(-1) return mappings if __name__ == "__main__": args = create_arguments_parser() print(args) path2index = args.directory doc = utils.read_document(path2index) if not args.no_checking: if not utils.verify_version(doc): print("Incorrect version used") sys.exit(-2) else: print("Correct version found") # read the mappings mappings = read_mappings(args.mapping) # check the keywords and update them if necessary check_keywords(doc) # process images process_images(path2index, doc, mappings)
rouge1_fscores_list = list()
rouge2_fscores_list = list()
rouge1_precisions_list = list()
rouge2_precisions_list = list()
rouge1_recalls_list = list()
rouge2_recalls_list = list()

w2v = util.read_word2vec_model()
directory = os.fsencode(util.DATA_PATH)
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    # sentences = read_document(filename)
    reference_summaries, K = util.read_single_ref_summaries(filename[:-4])
    sentence, _ = util.read_document(filename)
    word2vec = util.make_word_2_vec(sentence, w2v)
    candidate_set = np.array(list([*v] for k, v in word2vec.items()))
    try:
        rouge_1_fscore, rouge_2_fscore, rouge_1_precision, rouge_2_precision, \
            rouge_1_recall, rouge_2_recall = \
            util.evaluate(word2vec, NUMBER_SUMMARY_SET_ELEMENT, LAMBDA, TSTOP,
                          MAX_CONSE_REJ, reference_summaries)
        rouge1_fscores_list.append(rouge_1_fscore)
        rouge2_fscores_list.append(rouge_2_fscore)
        rouge1_precisions_list.append(rouge_1_precision)
        rouge2_precisions_list.append(rouge_2_precision)
        rouge1_recalls_list.append(rouge_1_recall)
        rouge2_recalls_list.append(rouge_2_recall)
    except ValueError:
        print("Sample larger than population")

def __append_tokens(tokens: dict, document_name: pathlib.Path) -> None:
    lines = u.read_document(document_name)
    for line in lines:
        for token in line.split():
            cnt = tokens.get(token, 0)
            tokens[token] = cnt + 1

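# __append_tokens accumulates per-token counts into a shared dictionary, and
# __vectorise_document above expects a token_map from token to integer id.
# A minimal sketch of how such a map could be derived from those counts; the
# helper name and the frequency-based ordering are assumptions, not part of
# the original module.
def __build_token_map(tokens: dict) -> dict:
    # most frequent tokens get the smallest ids
    ordered = sorted(tokens.items(), key=lambda kv: kv[1], reverse=True)
    return {token: idx for idx, (token, _) in enumerate(ordered)}
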
# def apply_parser_v1(document):
#     sents = tokenize(document)
#     result = []
#     for sent in sents:
#         result.append(tagger_sync(sent))
#     # tagger_async(document, 8)
#     return result


def apply_parser_v2(document):
    data = ""
    try:
        data = fdg_parser_ro.parse_text(document)
    except Exception as e:
        print("Error at calling fdg_parser. Reason:", str(e))
    result = ""
    try:
        result = anaphora_resolution.solve_links_manual(data)
    except Exception as e:
        print("Error at anaphora resolution. Reason:", str(e))
    return result


if __name__ == '__main__':
    import sys
    # Python 2 idiom: force UTF-8 as the default string encoding
    reload(sys)
    sys.setdefaultencoding('utf8')
    data, document = u.read_document()
    print(apply_parser_v2(document))

NUMBER_SUMMARY_SET_ELEMENT = 5  # used for fixed K
LAMBDA = 0.1
TSTOP = 0.0001
MAX_CONSE_REJ = 100

rouge1_fscores_list = list()
rouge2_fscores_list = list()
rouge1_precisions_list = list()
rouge2_precisions_list = list()
rouge1_recalls_list = list()
rouge2_recalls_list = list()

directory = os.fsencode(util.DATA_PATH)
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    sentences, words = util.read_document(filename)
    reference_summaries, K = util.read_single_ref_summaries(filename[:-4])
    print("Summary Length : ", K)
    term_frequency = util.make_term_frequency(sentences, words)
    try:
        rouge_1_fscore, rouge_2_fscore, rouge_1_precision, rouge_2_precision, \
            rouge_1_recall, rouge_2_recall = \
            util.evaluate(term_frequency, NUMBER_SUMMARY_SET_ELEMENT, LAMBDA, TSTOP,
                          MAX_CONSE_REJ, reference_summaries)
        rouge1_fscores_list.append(rouge_1_fscore)
        rouge2_fscores_list.append(rouge_2_fscore)
        rouge1_precisions_list.append(rouge_1_precision)
        rouge2_precisions_list.append(rouge_2_precision)
        rouge1_recalls_list.append(rouge_1_recall)
        rouge2_recalls_list.append(rouge_2_recall)
    except ValueError:
        print("Sample larger than population")

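# The excerpt ends with the per-document scores collected into lists; the
# original script presumably reports an aggregate after the loop, which is not
# shown here. A minimal sketch of such a reporting step, purely as an assumption:
import statistics

if rouge1_fscores_list:
    print("Mean ROUGE-1 F1:", statistics.mean(rouge1_fscores_list))
    print("Mean ROUGE-2 F1:", statistics.mean(rouge2_fscores_list))
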
def __flatten_document(document_name: pathlib.Path) -> list:
    lines = u.read_document(document_name)
    return [' '.join(lines)]

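# Each of the private helpers above re-reads the document from disk via
# u.read_document and applies one transformation. For illustration only, the
# same per-line cleaning steps could be combined in memory as follows; this
# combined helper is an assumption, not part of the original module.
import string

def _clean_line(line: str, stopwords: set) -> str:
    # in-memory equivalent of chaining __lowercase_document,
    # __remove_punction_from_document, __remove_numbers_from_document and
    # __remove_stopwords_from_document on a single line of text
    line = line.lower()
    line = ''.join(c for c in line if c not in string.punctuation and not c.isnumeric())
    return ' '.join(word for word in line.split() if word not in stopwords)
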
from datetime import datetime

data = []
fp = 'data/LRECjson/'
doc_count = 0
files = os.listdir(Path(fp))
start = datetime.now()
shuffle(files)

for jsonfile in files:
#for jsonfile in ['../data/LRECjson/2018_1049.json']:
    doc_id = doc_count
    doc_count += 1
    path = str(fp + str(jsonfile))
    title, abstract, keywords, text = utils.read_document(path)
    if None in [title, abstract, keywords, text]:
        continue
    doc_data = utils.process_document(title, abstract, keywords, text,
                                      doc_id=doc_id, jsonfile=jsonfile, verbose=1)
    if doc_data is None:
        continue

    # downsample document ngram data
for i in cluster:
    path_documents = path_documents_references + i + '/' + corpus_dir['textosFonte']
    path_reference = path_documents_references + i + '/' + corpus_dir['cst_extrato']
    documents = os.listdir(path_documents)
    cluster_sentences = []
    #print(i)
    prefix = i[:i.find('_')]
    path_summary = path_reference + prefix + corpus_dir['cst_extrato_name']
    reference_sentences = read_document_extract_cst(path_summary, 'ptg')
    for j in documents:
        sentences = read_document(path_documents + j, 'ptg')
        cluster_sentences.extend(sentences)
    auxi = 0
    dict_sentences = dict()
    for j in cluster_sentences:
        if j in reference_sentences:
            dict_sentences[j] = 0
    #print(path_summary)
    dictionary_cst[i] = dict_sentences

for i in dictionary_cst:
    print(i)

#write_data_to_disk(extras['PtgMDS_labels'], dictionary_cst)