import codecs
import re

import numpy as np
import en_core_web_sm

import wiki_utils


def get_scections_from_text(txt, high_granularity=True):
    sections_to_keep_pattern = wiki_utils.get_seperator_foramt() if high_granularity \
        else wiki_utils.get_seperator_foramt((1, 2))

    if not high_granularity:
        # If low granularity is required, flatten away the sub-sections nested below level 2.
        pattern_to_omit = wiki_utils.get_seperator_foramt((3, 999))
        txt = re.sub(pattern_to_omit, "", txt)

        # Delete the empty lines left behind by re.sub().
        sentences = [s for s in txt.strip().split("\n") if len(s) > 0 and s != "\n"]
        txt = '\n'.join(sentences).strip('\n')

    all_sections = re.split(sections_to_keep_pattern, txt)
    non_empty_sections = [s for s in all_sections if len(s) > 0]

    return non_empty_sections
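# Illustrative usage sketch (not part of the original module): split one prepared
# wiki article into its sections. The file path argument is hypothetical; the
# separator format itself comes from wiki_utils, as above.
def _example_split_article(path, high_granularity=True):
    with codecs.open(path, 'r', 'utf-8') as article:
        return get_scections_from_text(article.read(), high_granularity)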
def load_data(file_source):
    # word2vec = gensim.models.KeyedVectors.load_word2vec_format(
    #     PATH + '/word2vec/GoogleNews-vectors-negative300.bin', binary=True)

    # VECTORS, RESOURCES, CONTENT, load_vectors() and get_files() are defined
    # elsewhere in this module.
    cphrase = load_vectors(VECTORS)
    nlp = en_core_web_sm.load()

    with open(RESOURCES + 'stopwords.txt', 'r') as in_file:
        stop_words = in_file.read().splitlines()

    # boundaries = np.zeros(10000, 300)
    boundaries = []
    labels = []
    manual_stop = 50
    sample = 0
    exceptions = []  # tokens not found in the pre-trained vectors
    separator = wiki_utils.get_seperator_foramt()

    for f, file in enumerate(get_files(file_source)):
        if f < manual_stop:
            with codecs.open(file, 'r', 'utf-8') as article:
                segments = [s.strip('\n') for s in re.split(separator, article.read())]
                # segments = [s for s in segments if len(s) > 0]
                # not needed: synthetic segments are guaranteed to contain >= 2 sentences

            new_sent = True  # force initialisation of the first sentence container
            for segment in segments:
                # Documents have already been sanitised and prepared with \n delimiters.
                sentences = segment.split('\n')
                # sentences = segment.splitlines()
                exceptions.append([])  # for auditing purposes only
                # sentences = article.split('\n')  # should return the identical list

                for s, line in enumerate(sentences):
                    if new_sent:
                        # Only open a new sentence container if the previous one was
                        # successfully built on the last pass.
                        boundaries.append(np.zeros(300))
                        labels.append(0)  # default to a negative label
                        new_sent = False
                    else:
                        sample -= 1

                    # Strip any non-alphanumerics; do not lowercase yet, so the spaCy
                    # entity tagger can make use of the capitalisation.
                    sentence = re.sub(r'[^a-zA-Z0-9\s,.]+', '', re.sub('-', ' ', line)).strip()
                    exceptions[f].append([])  # for auditing purposes only

                    for token in nlp(sentence):
                        # Apostrophes (as in "don't") have been stripped from the frequency resource.
                        word = re.sub(r'\W', '', token.text.lower())
                        # Retain apostrophes (word2vec includes "don't" etc.) and map digits to '#'.
                        cleansed = re.sub(r'\d', '#', re.sub(r"[^\w']", '', token.text.lower()))
                        if (len(cleansed) > 0 and not word.isnumeric()
                                and word not in stop_words and token.lemma_ not in stop_words
                                and (token.pos_ in CONTENT or token.ent_iob_ != 'O')):
                            try:
                                boundaries[sample] += cphrase[cleansed]  # word2vec[cleansed]
                            except:
                                exceptions[f][s].append(cleansed)
                            else:
                                new_sent = True  # successfully embedded at least one token; this sentence container is built
                    sample += 1

                if new_sent:
                    labels[-1] = 1  # set a positive label for the last sentence in this segment

    return np.asarray(boundaries), np.asarray(labels)
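# Illustrative driver sketch (not part of the original module): how load_data
# might be invoked. The directory path is hypothetical.
if __name__ == '__main__':
    embeddings, boundary_labels = load_data('data/prepared_articles/')
    print('%d sentence vectors, %d segment boundaries'
          % (len(embeddings), int(boundary_labels.sum())))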