def find_usage_examples_from_summary(
        self,
        form: Form = None,
) -> List[UsageExample]:
    """Try to find and clean sentences containing the form and return them as usage examples"""
    if form is None:
        raise ValueError("form was None")
    logger = logging.getLogger(__name__)
    # find sentences
    # order in a list by length
    # pick the shortest one where the form representation appears
    logger.debug("Splitting the sentences using spaCy")
    nlp = Swedish()
    nlp.add_pipe('sentencizer')
    doc = nlp(self.text)
    sentences = set()
    raw_sentences = list(doc.sents)
    logger.info(f"Got {len(raw_sentences)} sentences from spaCy")
    for sentence in raw_sentences:
        # logger.info(sentence.text)
        # This is a very crude test for relevancy, we lower first to improve matching
        cleaned_sentence = sentence.text.lower()
        punctations = [".", ",", "!", "?", "„", "“", "\n"]
        for punctation in punctations:
            if punctation in cleaned_sentence:
                cleaned_sentence = cleaned_sentence.replace(punctation, " ")
        logger.debug(f"cleaned sentence:{cleaned_sentence}")
        if f" {form.representation.lower()} " in f" {cleaned_sentence} ":
            # Add to the set first to avoid duplicates
            sentences.add(sentence.text.replace("\n", "").strip())
    logger.info(
        f"Found {len(sentences)} sentences which contained {form.representation}"
    )
    examples = []
    count_discarded = 0
    for sentence in sentences:
        sentence_length = len(sentence.split(" "))
        if (sentence_length > config.min_word_count
                and sentence_length < config.max_word_count):
            examples.append(UsageExample(sentence=sentence, record=self))
        else:
            count_discarded += 1
    if count_discarded > 0:
        logger.info(
            f"{count_discarded} sentences were discarded based on length")
    # print("debug exit")
    # exit(0)
    return examples
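The method above relies on spaCy's rule-based sentencizer rather than a trained parser. A minimal, standalone sketch of just that splitting step (the sample sentence is only an illustration):

from spacy.lang.sv import Swedish

nlp = Swedish()               # blank Swedish pipeline: tokenizer only
nlp.add_pipe('sentencizer')   # rule-based sentence boundary detection
doc = nlp("Jag heter Nils. Jag bor i Stockholm.")
for sent in doc.sents:
    print(sent.text)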
def input_fn(file_name):
    train_data = []
    with open(file_name, 'r', encoding='utf-8') as f:
        heads = []
        deps = []
        annotations = []
        texts = []
        for line in f.readlines():
            if re.match("^# text = ", line):
                texts.append(line.lstrip("# text = ").rstrip('\n'))
            elif re.match("^#", line):
                del line
            elif not line.strip() == "":
                sent = line.lstrip()
                lines = [line.split('\t') for line in sent.split('\n')][0]
                if lines[6] == "_":
                    del lines
                else:
                    heads.append(int(lines[6]))
                    deps.append(lines[7])
            elif line.strip() == "":
                annotations.append([heads, deps])
                heads = []
                deps = []
    for i in range(len(annotations)):
        # Pair each sentence text with its head indices and dependency labels.
        text = texts[i]
        heads, deps = annotations[i][0], annotations[i][1]
        nlp = Swedish()
        spacy_doc = nlp(text)
        train_format = (str(spacy_doc), {'heads': (heads), 'deps': (deps)})
        train_data.append(train_format)
    return train_data
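input_fn expects CoNLL-U-style input (and assumes re and Swedish are imported): a "# text = ..." comment per sentence, tab-separated token lines with the head index in column 7 and the dependency label in column 8, and a blank line closing each sentence. A hedged sketch with a made-up fragment; the annotation values are illustrative, not gold-standard:

# Hypothetical CoNLL-U fragment, written to a temporary file for the demo.
conllu_sample = (
    "# text = jag heter nils\n"
    "1\tjag\tjag\tPRON\t_\t_\t2\tnsubj\t_\t_\n"
    "2\theter\theta\tVERB\t_\t_\t0\troot\t_\t_\n"
    "3\tnils\tnils\tPROPN\t_\t_\t2\tobj\t_\t_\n"
    "\n"
)
with open("sample.conllu", "w", encoding="utf-8") as f:
    f.write(conllu_sample)

# Expected shape of each training item:
# ('jag heter nils', {'heads': [2, 0, 2], 'deps': ['nsubj', 'root', 'obj']})
print(input_fn("sample.conllu"))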
def create_model(vectors_loc=None, lang=None, stz=True, vectors_name='fasttext',
                 max_items=-1):
    if lang is None or lang == 'sv' and not stz:
        nlp = Swedish()
    elif not stz:
        nlp = spacy.blank(lang)
    elif stz:
        stanza.download(lang)
        snlp = stanza.Pipeline(lang=lang)
        nlp = StanzaLanguage(snlp)
    with open(vectors_loc, 'rb') as file_:
        logger.info("Reading file '{}'".format(vectors_loc))
        header = file_.readline()
        nr_row, nr_dim = header.split()  # the first line is number of tokens and dimensions
        counter = 0
        nlp.vocab.reset_vectors(width=int(nr_dim))
        for line in file_:
            if counter % 100 == 0:
                logger.info(counter)
            if counter == max_items:
                break
            counter = counter + 1
            line = line.rstrip().decode('utf8')
            pieces = line.rsplit(' ', int(nr_dim))
            word = pieces[0]
            vector = np.asarray([float(v) for v in pieces[1:]], dtype='f')
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
    nlp.vocab.vectors.name = vectors_name  # give vectors a name
    return nlp
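A hedged usage sketch for create_model. The vectors file name is an assumption (any fastText .vec file in word2vec text format should work), and the imports used above (spacy, stanza, numpy as np, a module-level logger, and StanzaLanguage from spacy-stanza) are assumed to be in place:

# Build a blank Swedish pipeline with the first 10 000 vectors from a
# fastText .vec file (the file name here is only an example), then save it.
nlp = create_model(vectors_loc="cc.sv.300.vec", lang="sv", stz=False,
                   max_items=10_000)
print(nlp.vocab.vectors.shape)    # (number of vectors, vector width)
nlp.to_disk("sv_fasttext_model")  # can be reloaded later with spacy.load()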
def lemmatization_sv(text):
    """Return the words of the given string in their base (lemma) form,
    joined back into a string."""
    nlp = Swedish()
    tokenized = nlp(text)
    lemmalized = [token.lemma_ for token in tokenized]
    lemmalized_words_string = ' '.join(lemmalized)
    return lemmalized_words_string
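A minimal usage sketch. The import is assumed by the function above; note that a blank Swedish() pipeline ships no trained lemmatizer component, so how much token.lemma_ actually normalises depends on the installed spaCy version and lookup data:

from spacy.lang.sv import Swedish  # import assumed by lemmatization_sv above

print(lemmatization_sv("Hundarna sprang snabbt genom skogen"))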
def sv_nlp():
    return Swedish()
data_words_nostops = remove_stopwords(cleaned_list)
wrdCloud = ''
#strr=[]
for row in data_words_nostops:
    # strr.append(",".join(row))
    wrdCloud += " ".join(row)

texts = ""
for row in data_words_nostops:
    texts += ','
    texts += ','.join(row)

# spacy for lemmatization
from spacy.lang.sv import Swedish
nlp = Swedish()
doc_lemmatized = nlp(texts)
##print('Tags', [(t.text) for t in doc_lemmatized])

ftext = []
for wrd in doc_lemmatized:
    coma = ','
    if (wrd.text != ','):
        ftext.append(wrd.text)

# words = " ".join(re.findall("[a-z\såäö]+", line[1]))
# topic_words.append(words.split())

# RQ2 implementation starts here
retweeted = []
for row in tweet_list:
    if row.startswith('RT'):
        retweeted.append(row)
# ### How about other languages?

# In[32]:

doc = nlp("jag heter nils")
displacy.render(doc, jupyter=True)

# ### test swedish

# In[33]:

from spacy.lang.sv import Swedish
nlp = Swedish()  # use directly
#nlp = spacy.blank("sv")  # blank instance

# ### basic tokenization enabled
# - but not syntactic parsing

# In[34]:

doc = nlp("jag heter nils")
for token in doc:
    print(token.text, token.pos_, token.dep_)
#displacy.render(doc, jupyter=True)
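As the cells above note, the blank Swedish pipeline only tokenizes, so token.pos_ and token.dep_ stay empty. A hedged follow-up sketch, assuming a trained Swedish pipeline such as sv_core_news_sm is available for your spaCy version:

import spacy

# Assumes the model has been downloaded first:
#   python -m spacy download sv_core_news_sm
nlp = spacy.load("sv_core_news_sm")
doc = nlp("jag heter nils")
for token in doc:
    print(token.text, token.pos_, token.dep_)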
def find_usage_examples_from_summary(
        self,
        form: Form = None,
) -> List[UsageExample]:
    """Try to find and clean sentences containing the form and return them as usage examples"""
    if form is None:
        raise ValueError("form was None")
    logger = logging.getLogger(__name__)
    # find sentences
    # order in a list by length
    # pick the shortest one where the form representation appears
    if self.language_code == WikimediaLanguageCode.ENGLISH:
        logger.info("using the English spaCy pipeline")
        nlp = English()
        nlp.add_pipe('sentencizer')
    elif self.language_code == WikimediaLanguageCode.SWEDISH:
        nlp = Swedish()
        nlp.add_pipe('sentencizer')
    elif (self.language_code == WikimediaLanguageCode.FRENCH or
          self.language_code == WikimediaLanguageCode.GERMAN or
          self.language_code == WikimediaLanguageCode.BOKMÅL or
          self.language_code == WikimediaLanguageCode.DANISH):
        logger.info(
            f"using the {self.language_code.name.title()} spaCy pipeline")
        try:
            nlp = spacy.load(f'{self.language_code.value}_core_news_sm')
        except OSError:
            raise ModuleNotFoundError(
                f"Please install the spacy model for "
                f"{self.language_code.name.title()} by running: "
                f"'python -m spacy download "
                f"{self.language_code.value}_core_news_sm' "
                f"in the terminal/cmd/powershell")
    else:
        raise NotImplementedError(
            f"Sentence extraction for {self.language_code.name} "
            f"is not supported yet, feel free to open an issue at "
            f"https://github.com/dpriskorn/LexUtils/issues")
    doc = nlp(self.text)
    sentences = set()
    for sentence in doc.sents:
        # logger.info(sentence.text)
        # This is a very crude test for relevancy, we lower first to improve matching
        cleaned_sentence = sentence.text.lower()
        punctations = [".", ",", "!", "?", "„", "“", "»"]
        for punctation in punctations:
            if punctation in cleaned_sentence:
                cleaned_sentence = cleaned_sentence.replace(punctation, " ")
        cleaned_sentence = cleaned_sentence.strip()
        logger.debug(f"cleaned sentence:{cleaned_sentence}")
        if f" {form.representation.lower()} " in cleaned_sentence:
            # Add to the set first to avoid duplicates
            sentences.add(sentence.text)
    examples = []
    for sentence in sentences:
        sentence_length = len(sentence.split(" "))
        if (sentence_length > config.min_word_count
                and sentence_length < config.max_word_count):
            # Clean the sentence so it looks better
            punctations = ["„", "“", "»"]
            for punctation in punctations:
                if punctation in sentence:
                    sentence = sentence.replace(punctation, " ")
            sentence = sentence.strip()
            examples.append(UsageExample(sentence=sentence, record=self))
    # print("debug exit")
    # exit(0)
    return examples