Code example #1
def read_seg_file(path,
                  word2vec,
                  remove_preface_segment=True,
                  ignore_list=False,
                  remove_special_tokens=False,
                  return_as_sentences=False,
                  high_granularity=True,
                  only_letters=False,
                  max_token_num=0):
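    """Read a tab-separated segmentation file into per-sentence word embeddings.

    Each line of a section is expected to carry two tab-separated fields, a
    location string and the OCR text; every space-separated word of the OCR
    text is embedded with word_model(). After each section, the index of the
    most recently added sentence is appended to targets as a segment boundary.
    Several keyword arguments are accepted but not used in this function.
    """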
    data, targets = [], []
    all_sections = get_sections(path, high_granularity)

    for section in all_sections:
        sentences = section.split('\n')
        if not sentences:
            continue

        for sentence in sentences:
            tokens = sentence.split("\t")
            # Expect exactly two tab-separated fields: a location and the OCR text.
            if len(tokens) != 2:
                continue
            loc, ocr = tokens
            sentence_words = ocr.split(" ")
            if sentence_words:
                offset, token_num = 0, len(sentence_words)
                sent_data = []
                for i, word in enumerate(sentence_words):
                    word_embed = word_model(word, word2vec)
                    #loc_embed = location_model(loc, i, offset, token_num)
                    sent_data.append(word_embed)
                    offset += len(word)
                # Optionally truncate overly long sentences.
                if max_token_num > 0:
                    sent_data = sent_data[:max_token_num]
                data.append(sent_data)
        if data:
            # Record the index of the most recent sentence as a segment boundary.
            targets.append(len(data) - 1)

    return data, targets, path
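
A minimal usage sketch for read_seg_file follows. The gensim KeyedVectors call is standard gensim API, but the module name seg_reader, the vector file word2vec.bin, and the input path sample.seg are illustrative assumptions, not part of the original code.

from gensim.models import KeyedVectors

from seg_reader import read_seg_file  # hypothetical module holding the function above

# Illustrative file names; replace with real paths.
word2vec = KeyedVectors.load_word2vec_format('word2vec.bin', binary=True)
data, targets, path = read_seg_file('sample.seg', word2vec,
                                    high_granularity=True, max_token_num=50)
print(f'{len(data)} sentences, {len(targets)} segment boundaries from {path}')
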
Code example #2
def read_wiki_file(path,
                   word2vec,
                   remove_preface_segment=True,
                   ignore_list=False,
                   remove_special_tokens=False,
                   return_as_sentences=False,
                   high_granularity=True,
                   only_letters=False):
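    """Read a Wikipedia article file and return its sentences and boundaries.

    Sections come from get_sections(); the preface section can be dropped and
    bare list-marker sentences can be skipped. By default every sentence is
    converted to a list of word vectors via word_model(); with
    return_as_sentences=True the raw sentence strings are kept instead,
    optionally reduced to letters, digits and spaces. After each non-empty
    section, the index of the most recently added sentence is appended to
    targets as a segment boundary.
    """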
    data = []
    targets = []
    all_sections = get_sections(path, high_granularity)
    required_sections = (all_sections[1:]
                         if remove_preface_segment and len(all_sections) > 0
                         else all_sections)
    required_non_empty_sections = [
        section for section in required_sections
        if len(section) > 0 and section != "\n"
    ]

    for section in required_non_empty_sections:
        sentences = section.split('\n')
        if sentences:
            for sentence in sentences:
                # Skip bare list-marker sentences when requested.
                is_list_sentence = sentence == wiki_utils.get_list_token() + "."
                if ignore_list and is_list_sentence:
                    continue
                if not return_as_sentences:
                    sentence_words = extract_sentence_words(
                        sentence, remove_special_tokens=remove_special_tokens)
                    if sentence_words:
                        data.append([
                            word_model(word, word2vec)
                            for word in sentence_words
                        ])
                    else:
                        logger.info('Sentence in wikipedia file is empty')
                else:  # for the annotation: keep the sentence as a raw string
                    if only_letters:
                        sentence = re.sub('[^a-zA-Z0-9 ]+', '', sentence)
                    data.append(sentence)
            if data:
                # Record the index of the most recent sentence as a segment boundary.
                targets.append(len(data) - 1)

    return data, targets, path
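
A usage sketch for read_wiki_file, under the same assumptions as above: gensim word vectors, and wiki_reader plus the article path as illustrative names only.

from gensim.models import KeyedVectors

from wiki_reader import read_wiki_file  # hypothetical module name

word2vec = KeyedVectors.load_word2vec_format('word2vec.bin', binary=True)
sentences, targets, path = read_wiki_file('article.txt', word2vec,
                                          remove_preface_segment=True,
                                          ignore_list=True,
                                          high_granularity=False)
print(f'{len(sentences)} sentences, segment boundaries at {targets}')
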
Code example #3
def read_choi_file(path,
                   word2vec,
                   sent_bert_vec,
                   train,
                   return_w2v_tensors=True,
                   manifesto=False):
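    """Read a Choi-style file whose paragraphs are separated by '=' markers.

    The raw text is split on the separator line, cleaned with
    clean_paragraph(), and shuffled during training. Sentences are tokenized
    with extract_sentence_words() and returned either as word vectors or as
    raw word lists, depending on return_w2v_tensors. targets holds the
    cumulative index of the last sentence of each paragraph; sent_bert_vec is
    passed through unchanged.
    """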
    separator = '========' if manifesto else '=========='
    with Path(path).open('r') as f:
        raw_text = f.read()
    paragraphs = [
        clean_paragraph(p) for p in raw_text.strip().split(separator)
        if len(p) > 5 and p != "\n"
    ]
    if train:
        random.shuffle(paragraphs)

    targets = []
    new_text = []
    text = []
    last_paragraph_sentence_idx = 0

    for paragraph in paragraphs:
        if manifesto:
            sentences = split_sentences(paragraph, 0)
        else:
            sentences = [
                s for s in paragraph.split('\n') if len(s.split()) > 0
            ]

        if sentences:
            # Number of sentences in this paragraph; the running total gives the split index.
            sentences_count = 0
            for sentence in sentences:
                words, sentence_str = extract_sentence_words(sentence)
                if not words:
                    continue
                sentences_count += 1
                text.append(words)
                if return_w2v_tensors:
                    new_text.append([word_model(w, word2vec) for w in words])
                else:
                    new_text.append(words)

            last_paragraph_sentence_idx += sentences_count
            targets.append(last_paragraph_sentence_idx - 1)

    return new_text, targets, path, sent_bert_vec
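
Finally, a sketch of calling read_choi_file. The choi_reader module, the '.ref' sample path, and passing None for sent_bert_vec are assumptions made for illustration; with return_w2v_tensors=False no embedding lookup happens, so no word2vec model is needed.

from choi_reader import read_choi_file  # hypothetical module name

# Illustrative path; with return_w2v_tensors=False word2vec can be None.
text, targets, path, sent_bert_vec = read_choi_file('choi/3-5/1.ref',
                                                    word2vec=None,
                                                    sent_bert_vec=None,
                                                    train=False,
                                                    return_w2v_tensors=False)
print(f'{len(text)} sentences, paragraph boundaries at {targets}')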