Code example #1
import re

import wiki_utils  # project helper module that builds the section-separator regexes (assumed available)


def get_scections_from_text(txt, high_granularity=True):
    sections_to_keep_pattern = (wiki_utils.get_seperator_foramt()
                                if high_granularity
                                else wiki_utils.get_seperator_foramt((1, 2)))
    if not high_granularity:
        # If low granularity is required, flatten segments below segment level 2
        # by removing their separator markers.
        pattern_to_omit = wiki_utils.get_seperator_foramt((3, 999))
        txt = re.sub(pattern_to_omit, "", txt)

        # Delete the empty lines left behind by re.sub().
        sentences = [
            s for s in txt.strip().split("\n") if len(s) > 0 and s != "\n"
        ]
        txt = '\n'.join(sentences).strip('\n')

    all_sections = re.split(sections_to_keep_pattern, txt)
    non_empty_sections = [s for s in all_sections if len(s) > 0]

    return non_empty_sections
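
A minimal usage sketch, assuming a preprocessed article whose sections are delimited with the wiki_utils separator markers; the file path below is hypothetical.

# Hypothetical driver: split an already-prepared article into its high-level sections.
with open('data/sample_article.txt', 'r', encoding='utf-8') as f:
    article_text = f.read()

sections = get_scections_from_text(article_text, high_granularity=False)
print(len(sections), 'sections')
print(sections[0].split('\n')[0])  # first line of the first section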
Code example #2
import codecs
import re

import en_core_web_sm
import numpy as np

import wiki_utils  # project helper module (assumed available)

# load_vectors, get_files, VECTORS, RESOURCES and CONTENT are assumed to be
# defined elsewhere in the project (the vector loader, a file walker, resource
# paths and the set of content POS tags respectively).


def load_data(file_source):
    #word2vec = gensim.models.KeyedVectors.load_word2vec_format(PATH+'/word2vec/GoogleNews-vectors-negative300.bin', binary=True)
    cphrase = load_vectors(VECTORS)
    nlp = en_core_web_sm.load()

    with open(RESOURCES + 'stopwords.txt', 'r') as in_file:
        stop_words = in_file.read().splitlines()

    boundaries = []
    labels = []
    manual_stop = 50
    sample = 0
    exceptions = []  # tokens not found in the pre-trained word vectors
    separator = wiki_utils.get_seperator_foramt()

    for f, file in enumerate(get_files(file_source)):
        if f < manual_stop:
            with codecs.open(file, 'r', 'utf-8') as article:
                segments = [
                    s.strip('\n') for s in re.split(separator, article.read())
                ]
                # segments = [s for s in segments if len(s) > 0] - not needed,
                # as synthetic segments are guaranteed to be >= 2
                new_sent = True  # force initialisation of the first sentence container
                for segment in segments:
                    # Documents have already been sanitised and prepared with \n delimiters.
                    sentences = segment.split('\n')
                    exceptions.append([])  # this is just for auditing purposes
                    for s, line in enumerate(sentences):
                        if new_sent:  # only append a new slot if a sentence was built on the last pass
                            boundaries.append(np.zeros(300))
                            labels.append(0)  # default to a negative label
                            new_sent = False
                        else:
                            sample -= 1
                        # Strip any non-alphanumerics; don't lowercase yet, so the
                        # spaCy entity tagger can make use of capitalisation.
                        sentence = re.sub(
                            r'[^a-zA-Z0-9\s,.]+', '', re.sub('-', ' ', line)
                        ).strip()
                        exceptions[f].append([])  # this is just for auditing purposes
                        for token in nlp(sentence):
                            # Apostrophes (as in "don't") have been stripped from the frequency resource.
                            word = re.sub(r'\W', '', token.text.lower())
                            # Retain apostrophes, as word2vec includes "don't" etc.,
                            # and replace digits with '#' to match the vector vocabulary.
                            cleansed = re.sub(
                                r'\d', '#',
                                re.sub(r"[^\w']", '', token.text.lower())
                            )
                            if (len(cleansed) > 0 and not word.isnumeric()
                                    and word not in stop_words
                                    and token.lemma_ not in stop_words
                                    and (token.pos_ in CONTENT
                                         or token.ent_iob_ != 'O')):
                                try:
                                    boundaries[sample] += cphrase[cleansed]  # word2vec[cleansed]
                                except KeyError:
                                    exceptions[f][s].append(cleansed)
                                else:
                                    new_sent = True  # embedded at least one token for this sentence
                        sample += 1
                    if new_sent:
                        labels[-1] = 1  # positive label for the last sentence in this segment
    return np.asarray(boundaries), np.asarray(labels)
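
A minimal usage sketch, assuming VECTORS, RESOURCES and a directory of prepared articles are configured as in the project; the path below is hypothetical.

# Hypothetical driver: build sentence embeddings and boundary labels, then
# inspect the shapes and the class balance of the segmentation labels.
X, y = load_data('data/wiki_articles/')
print(X.shape, y.shape)                        # (num_sentences, 300) and (num_sentences,)
print('boundary sentences:', int(y.sum()), 'of', len(y))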