def __lowercase_document(document_name: pathlib.Path) -> list:
    lines = u.read_document(document_name)
    result = []
    for line in lines:
        line = line.lower()
        result.append(line)
    return result
def __remove_punctuation_from_document(document_name: pathlib.Path) -> list:
    lines = u.read_document(document_name)
    result = []
    for line in lines:
        chars = [c for c in line if c not in string.punctuation]
        line = ''.join(chars)
        result.append(line)
    return result
Example #3
def __remove_numbers_from_document(document_name: pathlib.Path) -> list:
    lines = u.read_document(document_name)
    result = []
    for line in lines:
        chars = [c for c in line if not c.isnumeric()]
        line = ''.join(chars)
        result.append(line)
    return result
def __remove_stopwords_from_document(document_name: pathlib.Path,
                                     stopwords: set) -> list:
    lines = u.read_document(document_name)
    result = []
    for line in lines:
        words = line.split(' ')
        words = [word for word in words if word not in stopwords]
        sentence = ' '.join(words)
        result.append(sentence)
    return result
def __vectorise_document(document_name: pathlib.Path, token_map: dict) -> list:
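    # token_map is expected to map every token in the document to an id;
    # a token missing from the map would raise a KeyError below.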
    lines = u.read_document(document_name)
    vectors = []
    for line in lines:
        tokens = line.split()
        tokens = [token_map[token] for token in tokens]
        tokens = [str(token) for token in tokens]
        vector = ' '.join(tokens)
        vectors.append(vector)
    return vectors
def __tokenize_document(document_name: pathlib.Path) -> list:
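    # sent_tokenize and word_tokenize are assumed to be pre-instantiated
    # sentence/word tokenizer objects (e.g. NLTK tokenizers) defined
    # elsewhere in this module.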
    lines = u.read_document(document_name)
    result = []
    for line in lines:
        sentences = sent_tokenize.tokenize(line)
        for i in range(0, len(sentences)):
            sentence = sentences[i]
            words = word_tokenize.tokenize(sentence)
            sentences[i] = ' '.join(words)
        result.extend(sentences)
    return result
def __normalize_document_by_truncation(document_name: pathlib.Path,
                                       max_length: int) -> list:
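    # max_length acts as a running token budget for the whole document:
    # each kept line consumes part of it, and once the budget is exhausted
    # the remaining lines are truncated to zero tokens and dropped.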
    lines = u.read_document(document_name)
    results = []
    for line in lines:
        tokens = line.split()
        if len(tokens) > max_length:
            tokens = tokens[0:max_length]
        max_length = max_length - len(tokens)
        if len(tokens) > 0:
            results.append(' '.join(tokens))
    return results
def __document_to_sentences(document_name: pathlib.Path) -> list:
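    # pb is assumed to be the progressbar2 package; sentenceplus and
    # __calculate_sentences_median_grade_level are assumed to be defined
    # elsewhere in the module (sentenceplus presumably pairs a sentence
    # with its computed grade level).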
    lines = u.read_document(document_name)
    sentences = lines
    sentences = [sentence.strip() for sentence in sentences]
    sentences = [sentence for sentence in sentences if len(sentence) > 0]
    widgets = ['Calculating grade level: ', pb.Percentage(), ' ', pb.Bar(marker = '.', left = '[', right = ']'), ' ', pb.ETA()]
    with pb.ProgressBar(widgets = widgets, max_value = len(sentences)) as bar:
        for i in range(0, len(sentences)):
            bar.update(i)
            sentence = sentences[i]
            grade_level = __calculate_sentences_median_grade_level(sentence)
            sentences[i] = sentenceplus(sentence, grade_level)
    return sentences
    def load_temario(self, version):
        '''
        TEMARIO: number of words or sentences of the final summary
        CSTNews: 70% of the word count of the document with the greatest weight
        '''
        print "temario :)"
        corpus_dictionary = dict()

        if version == 'temario_v1':
            path = corpus_dir[version]
            path_sumarios = summaries_dir[version]
            documents = os.listdir(path)
            sumarios = os.listdir(path_sumarios)

            for i in documents:
                docPath = path + '/' + i
                # print docPath
                document_name = i[3:]
                document_name = document_name[:-4]

                document_sentences = read_document(docPath, self.language)
                class_labels_ml = None
                if self.dictionary_class_labels is not None:
                    class_labels_ml = self.dictionary_class_labels[
                        document_name]
                naive_tagged_sentences = naive_tag(
                    document_sentences, class_labels_ml
                )  # modified to also tag whether or not each sentence is part of the summary

                #print naive_tagged_sentences

                #corpus_dictionary[document_name] = [document_sentences]
                corpus_dictionary[document_name] = [naive_tagged_sentences]

            for i in sumarios:
                summPath = path_sumarios + i
                # print summPath
                summary_name = i[4:]
                summary_name = summary_name[:-4]
                size_summary = count_words(summPath, self.language)

                value = corpus_dictionary[summary_name]  # size_summary
                value.append(size_summary)
                corpus_dictionary[summary_name] = value

        else:
            print('version 2')

        return corpus_dictionary
    def load_cst_news(self, version):
        print "cst news :)"
        corpus_dictionary = dict()
        if version == 'cstnews_v1':
            path = corpus_dir[version]
            clusters = os.listdir(path)
            special = '.DS_Store'
            if special in clusters: clusters.remove(special)
            for i in clusters:
                sub_path = path + i + '/' + corpus_dir['textosFonte']
                documents = os.listdir(sub_path)
                if special in documents: documents.remove(special)

                allSentences = []
                document_lengths = []
                #top_sentences = []
                index = 1

                for j in documents:
                    document = sub_path + j
                    document_sentences = read_document(document, self.language)
                    class_labels_ml = None
                    if self.dictionary_class_labels is not None:
                        class_labels_ml = self.dictionary_class_labels[i]

                    #for k in  range(3):
                    #    top_sentences.append(document_sentences[k])

                    document_size = count_words(document, self.language)
                    document_lengths.append(document_size)

                    taggedSentences = tag_sentence(document_sentences, index,
                                                   class_labels_ml)
                    #print taggedSentences

                    index += 1

                    allSentences.extend(taggedSentences)

                size_cluster = max(document_lengths)
                size_summary = (30 * size_cluster) // 100  # 30% of the largest source document, as an integer word count
                #corpus_dictionary[i] = [allSentences, size_summary, top_sentences]
                corpus_dictionary[i] = [allSentences, size_summary]

        else:
            print('version 2')

        # corpus = ['dictionary with the names and the data', 'loaded corpus, unprocessed', 'vector of summary sizes']
        return corpus_dictionary
Example #11
                print("Skipping mapping: " + line)
    except IOError as e:
        print("The mappings could not be read ({0}): {1}".format(e.errno, e.strerror))
        sys.exit(-1)

    return mappings


if __name__ == "__main__":

    args = create_arguments_parser()
    print(args)

    path2index = args.directory

    doc = utils.read_document(path2index)
    if not args.no_checking:
        if not utils.verify_version(doc):
            print("Incorrect version used")
            sys.exit(-2)
        else:
            print("Correct version found")

    # read the mappings
    mappings = read_mappings(args.mapping)

    # check the keywords and update them if necessary
    check_keywords(doc)

    # process images
    process_images(path2index, doc, mappings)
rouge1_fscores_list = list()
rouge2_fscores_list = list()
rouge1_precisions_list = list()
rouge2_precisions_list = list()
rouge1_recalls_list = list()
rouge2_recalls_list = list()

w2v = util.read_word2vec_model()
directory = os.fsencode(util.DATA_PATH)
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    # sentences = read_document(filename)
    reference_summaries, K = util.read_single_ref_summaries(filename[:-4])

    sentence, _ = util.read_document(filename)
    word2vec = util.make_word_2_vec(sentence, w2v)

    candidate_set = np.array([list(v) for v in word2vec.values()])
    try:
        rouge_1_fscore, rouge_2_fscore , rouge_1_precision, rouge_2_precision,\
            rouge_1_recall, rouge_2_recall = \
            util.evaluate(word2vec, NUMBER_SUMMARY_SET_ELEMENT, LAMBDA, TSTOP, MAX_CONSE_REJ, reference_summaries)
        rouge1_fscores_list.append(rouge_1_fscore)
        rouge2_fscores_list.append(rouge_2_fscore)
        rouge1_precisions_list.append(rouge_1_precision)
        rouge2_precisions_list.append(rouge_2_precision)
        rouge1_recalls_list.append(rouge_1_recall)
        rouge2_recalls_list.append(rouge_2_recall)
    except ValueError:
        print("Sample larger than population")
def __append_tokens(tokens: dict, document_name: pathlib.Path) -> None:
    lines = u.read_document(document_name)
    for line in lines:
        for token in line.split():
            cnt = tokens.get(token, 0)
            tokens[token] = cnt + 1
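
A minimal usage sketch (not part of the original examples; the file path and variable names are made up, and both helpers are assumed to live in the same module) showing how the counts gathered by __append_tokens could be turned into the token_map expected by __vectorise_document:

import pathlib

counts = {}
__append_tokens(counts, pathlib.Path('corpus/doc1.txt'))               # token -> raw count
token_map = {token: idx for idx, token in enumerate(sorted(counts))}   # token -> integer id
vectors = __vectorise_document(pathlib.Path('corpus/doc1.txt'), token_map)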
Example #14
# def apply_parser_v1(document):
#     sents = tokenize(document)
#     result = []
#     for sent in sents:
#         result.append(tagger_sync(sent))
#     # tagger_async(document, 8)
#     return result


def apply_parser_v2(document):
    data = ""
    try:
        data = fdg_parser_ro.parse_text(document)
    except Exception as e:
        print("Error at calling fdg_parser. Reason:", str(e))
    result = ""
    try:
        result = anaphora_resolution.solve_links_manual(data)
    except Exception as e:
        print("Error at anaphora resolution. Reason:", str(e))
    return result


if __name__ == '__main__':
    import sys
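    # Python 2 only idiom: force UTF-8 as the default string encoding.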
    reload(sys)
    sys.setdefaultencoding('utf8')
    data, document = u.read_document()
    print(apply_parser_v2(document))
NUMBER_SUMMARY_SET_ELEMENT = 5  # used for fixed K
LAMBDA = 0.1
TSTOP = 0.0001
MAX_CONSE_REJ = 100

rouge1_fscores_list = list()
rouge2_fscores_list = list()
rouge1_precisions_list = list()
rouge2_precisions_list = list()
rouge1_recalls_list = list()
rouge2_recalls_list = list()

directory = os.fsencode(util.DATA_PATH)
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    sentences, words = util.read_document(filename)
    reference_summaries, K = util.read_single_ref_summaries(filename[:-4])
    print("Summary Length : ", K)
    term_frequency = util.make_term_frequency(sentences, words)
    try:
        rouge_1_fscore, rouge_2_fscore , rouge_1_precision, rouge_2_precision,\
            rouge_1_recall, rouge_2_recall = \
            util.evaluate(term_frequency, NUMBER_SUMMARY_SET_ELEMENT, LAMBDA, TSTOP, MAX_CONSE_REJ, reference_summaries)
        rouge1_fscores_list.append(rouge_1_fscore)
        rouge2_fscores_list.append(rouge_2_fscore)
        rouge1_precisions_list.append(rouge_1_precision)
        rouge2_precisions_list.append(rouge_2_precision)
        rouge1_recalls_list.append(rouge_1_recall)
        rouge2_recalls_list.append(rouge_2_recall)
    except ValueError:
        print("Sample larger than population")
def __flatten_document(document_name: pathlib.Path) -> list:
    lines = u.read_document(document_name)
    return [' '.join(lines)]
Example #17
from datetime import datetime

data = []

fp = 'data/LRECjson/'
doc_count = 0
files = os.listdir(Path(fp))
start = datetime.now()
shuffle(files)
for jsonfile in files:
    #for jsonfile in ['../data/LRECjson/2018_1049.json']:
    doc_id = doc_count
    doc_count += 1
    path = str(fp + str(jsonfile))

    title, abstract, keywords, text = utils.read_document(path)

    if None in [title, abstract, keywords, text]:
        continue
    doc_data = utils.process_document(title,
                                      abstract,
                                      keywords,
                                      text,
                                      doc_id=doc_id,
                                      jsonfile=jsonfile,
                                      verbose=1)

    if doc_data is None:
        continue

    # downsample document ngram data
Example #18
for i in cluster:
    path_documents = path_documents_references + i + '/' + corpus_dir[
        'textosFonte']
    path_reference = path_documents_references + i + '/' + corpus_dir[
        'cst_extrato']

    documents = os.listdir(path_documents)
    cluster_sentences = []
    #print i
    prefix = i[:i.find('_')]
    path_summary = path_reference + prefix + corpus_dir['cst_extrato_name']
    reference_sentences = read_document_extract_cst(path_summary, 'ptg')

    for j in documents:
        sentences = read_document(path_documents + j, 'ptg')
        cluster_sentences.extend(sentences)

    auxi = 0
    dict_sentences = dict()
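    # keep the cluster sentences that also appear in the reference extract;
    # the value 0 is just a placeholder label here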
    for j in cluster_sentences:
        if j in reference_sentences:
            dict_sentences[j] = 0

    #print path_summary
    dictionary_cst[i] = dict_sentences

for i in dictionary_cst:
    print(i)

#write_data_to_disk(extras['PtgMDS_labels'], dictionary_cst)