def main():
    Eng_side = 1
    source_data = codecs.open(sys.argv[1], 'r', encoding='utf-8').readlines()
    target_data = codecs.open(sys.argv[2], 'r', encoding='utf-8').readlines()

    # Each target line is "sentence ||| alignment", where the alignment is a
    # space-separated list of src-tgt index pairs such as "0-0 1-2".
    align = []
    for index, sent in enumerate(target_data):
        target_data[index], tmp = sent.split(" ||| ")
        tmp = tmp.split()
        tmp = [int(a.split("-")[1]) for a in tmp]
        align.append(tmp)

    print("S:", source_data[:5])
    print("T:", target_data[:5])
    print(align[:5])

    if Eng_side == 0:
        for sent in source_data:
            text = Text(sent)
            print(text.pos_tags)
            sys.exit()
    else:
        for sent in target_data:
            text = Text(sent)
            Noun_index = []
            index = 0
            for word, POS in text.pos_tags:
                if POS == "NOUN":
                    Noun_index.append(index)
                index += 1
            print(Noun_index)
            sys.exit()
def polyglot_entities(fileids=None, section=None, corpus=kddcorpus):
    """
    Extract entities from each file using polyglot
    """
    results = defaultdict(lambda: defaultdict(list))
    fileids = fileids or corpus.fileids()

    for fileid in fileids:
        if section is not None:
            text = Text((list(sectpull([fileid], section=section))[0][1]))
        else:
            text = Text(corpus.raw(fileid))

        for entity in text.entities:
            etext = " ".join(entity)

            if entity.tag == 'I-PER':
                key = 'persons'
            elif entity.tag == 'I-ORG':
                key = 'organizations'
            elif entity.tag == 'I-LOC':
                key = 'locations'
            else:
                key = 'other'
            results[fileid][key].append(etext)

    return results
def extractEntities(self, text=None, filePath=None):
    if text and filePath:
        self.entity_logger.error(
            "Fatal error: text (string) and file-input cannot be input at the same time. Undefined behavior"
        )
        sys.exit(1)
    if text:
        self.text = text
        # str.replace returns a new string, so the result must be assigned back.
        self.text = self.text.replace("-", " - ")
        self.text = self.text.replace("-", " ")
        self.entities = Text(self.text, hint_language_code='no').entities
        self.entity_logger.info(
            "Entity extraction completed from text string"
        )
    if filePath:
        if os.path.isfile(filePath):
            with open(filePath) as f:
                self.text = f.read()
            self.text = self.text.replace("-", " - ")
            self.entities = Text(self.text, hint_language_code='no').entities
            self.entity_logger.info(
                "Entities extracted from {}".format(filePath))
            self.entity_logger.info(
                "Following entities: {} were extracted from file: {}".format(
                    self.entities, filePath))
    self.formatEntities()
def get_named_entities(text, lang="tr"):
    ptext = Text(text, hint_language_code=lang)
    # output can be re-organised
    sentences = ptext.sentences
    entities = []
    for sentence in sentences:
        lang_sentence = Text(str(sentence), hint_language_code=lang)
        s_entities = [(" ".join(entity), entity.tag)
                      for entity in lang_sentence.entities]
        entities.extend(s_entities)
    return entities
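# A minimal usage sketch for get_named_entities, assuming the polyglot
# embeddings and NER model for the hinted language are installed
# (e.g. `polyglot download embeddings2.tr ner2.tr`). The helper returns a list
# of (entity_text, tag) tuples, where tag is one of polyglot's
# I-PER / I-ORG / I-LOC labels.
sample_text = "Mustafa Kemal Atatürk 1923'te Ankara'da cumhuriyeti ilan etti."
for surface, tag in get_named_entities(sample_text, lang="tr"):
    print(tag, surface)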
def simplify(text, tips):
    if text is None or len(text.strip()) == 0:
        return
    pgText = Text(text)
    pgText.hint_language_code = 'fr'
    pgText.pos_tags
    for word, pos in pgText.pos_tags:
        startpos = text.index(word)
        if pos == u'PRON':
            new_word = replace_propn(word)
            if new_word:
                print(word)
                index = len(warnings)
                warnings.append(Warning(index, startpos, startpos + len(word),
                                        "Utiliser mot simple", new_word))
                # tips.append(Tip(C_COMPLEX_WORD, startpos, startpos + len(word), word))
        elif pos == u'VERB':
            if word.startswith("l'") or word.startswith("d'"):
                verb = word.split("'")[1]
            else:
                verb = word
            new_word = replace_verb(verb)
            if new_word:
                index = len(warnings)
                warnings.append(
                    Warning(
                        index,
                        startpos,
                        startpos + len(word),
                        "Utiliser:",
                        lemma[verb] + ' -> ' + new_word
                    )
                )

    for sentence in Text(text).sentences:
        sentstr = str(sentence)
        if sentstr.count(',') >= 1:
            idofcomma = sentstr.index(',')
            startidx = sentence.start - 1 + idofcomma
            index = len(warnings)
            # TODO: change message to French
            warnings.append(
                Warning(
                    index,
                    startidx,
                    startidx + 5,
                    "Avoid clauses",
                    "Delete or use separate sentences"
                )
            )
def entity_strings(article):
    try:
        curr_entities = Text(article).entities
    except Exception:
        # This is necessary because polyglot has problems with some UTF-8 characters
        printable = ''.join(x for x in article if x.isprintable())
        curr_entities = Text(printable).entities

    entity_strings = []
    for entity in curr_entities:
        entity_str = extract_chunk_string(entity, article)
        entity_strings.append(entity_str)
    return entity_strings
def test_detection(test1, test2):
    """
    This is for testing purposes only.
    """
    ld = detect(test1)
    print("langdetect", ld)
    ld = detect(test2)
    print("langdetect", ld)

    text = Text(test1)
    pg = text.language.code
    print("polyglot", pg)
    text = Text(test2)
    pg = text.language.code
    print("polyglot", pg)
def tokenize(self, caption, lang, b_filter_url=True, b_filter_special=True,
             b_remove_stop=True, b_unique=True):
    caption = re.sub(r'[\s]', ' ', caption.lower(), flags=re.UNICODE)

    if lang == 'en':
        caption = re.sub(r'[^\w\s@#]', '', caption, flags=re.UNICODE)
        tokens = filter(lambda x: len(x) > 2, caption.strip().split(' '))
        if b_filter_special:
            tokens = filter(lambda x: not self.is_special(x), tokens)
        if b_filter_url:
            tokens = filter(lambda x: not self.is_url(x), tokens)
        if b_remove_stop:
            tokens = filter(lambda x: x not in self.stop[lang], tokens)
        if b_unique:
            return list(set(tokens))
        return list(tokens)
    elif lang == 'ar' or lang == 'ru':
        try:
            if b_filter_special:
                caption = ' '.join(filter(lambda x: not self.is_special(x),
                                          caption.split(' ')))
            tokens = filter(lambda x: len(x) > 1, Text(caption).words)
            if b_filter_url:
                tokens = filter(lambda x: not self.is_url(x), tokens)
            if b_remove_stop:
                tokens = filter(lambda x: x not in self.stop[lang], tokens)
            if b_unique:
                return list(set(tokens))
            return list(tokens)
        except Exception:
            traceback.print_exc()
            return []
    else:
        return []
def identify_language(text):
    """
    Identifies language from string using polyglot package.

    :param text: String to use for language identification
    :return: Language name (English)
    """
    try:
        if text not in ('', None):
            text = Text(text)
            language_code = text.language.code
            if language_code is not None:
                language_name = country.languages.get(
                    alpha_2=language_code).name
            else:
                language_name = None
        else:
            language_name = None
            language_code = None
    except AttributeError as ae:
        language_name = None
        language_code = None
    return language_name
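# A minimal usage sketch for identify_language, assuming the polyglot language
# detector is installed and `country` refers to a pycountry-style module whose
# languages.get(alpha_2=...) lookup is available in this scope.
print(identify_language("Dies ist ein kurzer deutscher Satz."))  # e.g. "German"
print(identify_language(""))                                     # None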
def get_quote_vector(entry, fast_text_models, enriched_collection):
    article = enriched_collection.find_one({"url": entry["source"]})
    fast_text = fast_text_models[entry["language"]]

    cleaned_quote = get_cleaned_content(entry["quote"])
    try:
        parsed = Text(cleaned_quote)
        tokens = [str(token).lower() for token in parsed.tokens]
    except pycld2.error as err:
        print(err)
        tokens = cleaned_quote.split()
    quote_vector = vectorize_tokens(tokens, fast_text)

    quote_vectors = []
    for talker in entry["talker"]:
        entity_key = talker["entity"].lower().replace(".", "DOT")
        entity_tokens = [
            token.lower()
            for token in article["cleaned_content_entities_parsed"][entity_key]
        ]
        entity_vector = vectorize_tokens(entity_tokens, fast_text)
        semantic_distance = cosine(quote_vector, entity_vector)
        quote_vectors.append(semantic_distance)

    return np.array(quote_vectors)
def transform(self, X):
    print("loading spacy model before transforming")
    # Evaluating if spacy spanish model exists
    nlp = spacy.load(self.model)
    print("evaluating sentiment2 model before transforming")
    # Evaluating if polyglot sentiment module exists
    try:
        Text("Evaluando existencia de sentiment2.es",
             hint_language_code="es").polarity
    except Exception:
        # Destination where we will copy the sentiment2 pickle.
        # This exception handler is designed to work on Google App Engine with
        # /code/ as the python root. Comment it out and install the package
        # manually for other platforms.
        print("sentiment2 not found. Copying...")
        parent_dest = str(downloader.default_download_dir())
        createdPath1 = parent_dest + "/sentiment2/es"
        if not os.path.isdir(createdPath1):
            os.makedirs(createdPath1)
        copyfile(
            str(os.getcwd()) + "/processing/es.sent.pkl.tar.bz2",
            createdPath1 + "/es.sent.pkl.tar.bz2")
        print("copy finished")
    print("transforming")
    return numpy.concatenate(
        [singleSampleProcessPipeline(doc, nlp) for doc in X])
def determine_text_languages(string):
    input_object = Text(string)
    temp_list = []
    # Merge consecutive sentences that share the same detected language.
    for sentence in input_object.sentences:
        detect = Detector(str(sentence))
        if len(temp_list) == 0:
            temp_list.append([str(sentence), detect.language.code])
        else:
            if temp_list[-1][1] == detect.language.code:
                temp_list[-1][0] = temp_list[-1][0] + " " + str(sentence)
            else:
                temp_list.append([str(sentence), detect.language.code])

    new_list = []
    for i in temp_list:
        new_list.append(i[0])

    output = []
    start = 0
    end = 0
    for sentence in new_list:
        detect = Detector(str(sentence))
        end = start + len(sentence)
        position = (start, end)
        start = end + 1
        output.append((detect.language.code, position, detect.language.confidence,))
    return output
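# A minimal usage sketch for determine_text_languages, assuming
# polyglot.text.Text and polyglot.detect.Detector are importable and the
# code-switched input consists of full sentences. Each result tuple has the
# form (language_code, (start_offset, end_offset), confidence).
mixed = "This part is written in English. Esta parte está escrita en español."
for code, span, confidence in determine_text_languages(mixed):
    print(code, span, confidence)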
def preprocess(self, out_file, lc):
    """Tokenize, (lowercase,) sub-word split.

    Using Polyglot since it was used for JW300.
    Preprocess the source column of a dataframe object and write to file.
    Pipeline:
    - tokenize
    - split into sub-words
    Append pre-processed sources to dataframe."""
    tokenized_sentences = []
    bped_sentences = []
    sources = []
    with open(out_file, 'w') as ofile:
        for i, row in self._src_df.iterrows():
            sentence_i = Text(row[0]).sentences[0]
            tokenized_sentence = ""
            bped_sentence = ""
            tokenized = " ".join(sentence_i.words)
            sources.append(str(sentence_i))
            if lc:
                tokenized = tokenized.lower()
            tokenized_sentence = tokenized
            bped = self._bpe_model.process_line(tokenized)
            bped_sentence = bped
            ofile.write("{}\n".format(bped))
            tokenized_sentences.append(tokenized_sentence)
            bped_sentences.append(bped_sentence)
    data = self._src_df.assign(
        tokenized_sentences=tokenized_sentences)
    data = data.assign(
        bped_sentences=bped_sentences)
    return data, sources
def Extract(self, request, context):
    text = request.text
    if text is None or not len(text.strip()):
        return
    entity_count = 0
    for language in request.languages:
        if language not in LANGUAGES:
            continue
        try:
            parsed = Text(text, hint_language_code=language)
            for entity in parsed.entities:
                label = ' '.join(entity)
                label = CLEAN.sub(' ', label)
                label = collapse_spaces(label)
                if len(label) < 4 or len(label) > 200:
                    continue
                if ' ' not in label:
                    continue
                length = entity.end - entity.start
                entity_count += 1
                yield ExtractedEntity(label=label,
                                      offset=entity.start,
                                      length=length,
                                      type=TYPES[entity.tag])
        except Exception:
            log.exception("Cannot extract. Language: %s", language)
    log.info("Extract: extracted %s entities.", entity_count)
def polyglot_tags(self):
    with open(config.file_location, errors="ignore") as r:
        inputtext = r.read()
    tok = ""
    with open("punkttokenizer_fullcorpus.pickle", 'rb') as f:
        # using our custom tokenizer
        custom_sent_tokenizer = pickle.load(f)
        tokenized = custom_sent_tokenizer.tokenize(inputtext)
        tok += " ".join(tokenized)
    text = Text(tok)
    if config.type_of_tag == "pos":
        print("\nPos tags using Polyglot\n")
        print(text.pos_tags)
    elif config.type_of_tag == "ner":
        print("\nNER tags using Polyglot\n")
        print(text.entities)
    elif config.type_of_tag == "mor":
        print("\nMorphemes using Polyglot\n")
        # morphemes help for cases where the corpus is not formatted properly,
        # i.e. spaces between words are removed, e.g. "corporatebank" ->
        # "corporate", "bank"
        print(text.morphemes)
    else:
        print(
            "\n Please provide correct value to type_of_tag. Refer to the instructions\n"
        )
def get_score(row):
    text = Text(row.full_text)
    try:
        score = sum([word.polarity for word in text.words])
    except ValueError as e:
        score = np.NaN
    return score
def polyglot_sentiment():
    from polyglot.text import Text
    data = dict(default_data)
    data['message'] = "Sentiment Analysis API - POST only"
    data['sentiment'] = {}
    params = request.form  # postdata

    if not params:
        data['error'] = 'Missing parameters'
        return jsonify(data)

    if not params.get('text'):
        data['error'] = 'Text parameter not found'
        return jsonify(data)

    if 'lang' not in params:
        language = 'en'  # default language
    else:
        language = params['lang']

    polyglot_text = Text(params['text'], hint_language_code=language)
    data['sentiment'] = polyglot_text.polarity
    return jsonify(data)
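# A minimal client-side sketch for the endpoint above, assuming it is exposed
# at a hypothetical route such as /sentiment on a local Flask server and that
# polyglot's sentiment2 model for the hinted language is installed.
import requests

resp = requests.post("http://localhost:5000/sentiment",
                     data={"text": "I really enjoyed this movie", "lang": "en"})
print(resp.json()["sentiment"])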
def find_entites(self, input_text):
    lang = self.lang_detect(input_text)

    def get_tagged_tuple(word, entities):
        for e in entities:
            if word in list(e):
                return (' '.join(e), e.tag)
        return (word, None)

    def remove_duplicates(l):
        seen = set()
        result = []
        for item in l:
            if item not in seen:
                seen.add(item)
                result.append(item)
        return result

    if lang in self.__langs:
        text = Text(input_text)
        sents_res = []
        for sent in text.sentences:
            word_tags = []
            for wrd in sent.words:
                tagged_tuple = get_tagged_tuple(wrd, text.entities)
                word_tags.append(tagged_tuple)
            sents_res.append(remove_duplicates(word_tags))
        res = {'lang': lang, 'ner_result': sents_res}
        return res
    else:
        raise LanguageNotAvailableError('Language is not available')
def use_polyglot(text):
    """
    Input: string. Output: predicted language code.
    """
    text = Text(text)
    plang = text.language.code
    return plang
async def get_words_and_phrases(text, text_language, user_language, user):
    sentences = list()
    processor = TextPreprocessor(CODES[text_language])  # TODO save to cache the processors
    t = Text(text)
    subscribed = mysql_connect.check_subscribed(user)
    if not subscribed:
        await bot.send_message(user, "/subscribe to get translations for your text sentences")

    for s in t.sentences:
        sent = BotSentence(s.start, s.end)
        # TODO paid feature
        translation = await bot_utils.get_definitions(text_language, user_language, s.string, user)
        sent.translation = translation
        key_words = processor.key_words(s.string)
        for kw in key_words:
            w = kw[0]
            if ' ' in w:
                sent.words.append(w)
        for word in s.words:
            word = str(word)
            if re.match(r"[^\w]+", word) is not None:
                continue
            sent.words.append(word)
        sentences.append(sent)
    return sentences
def getNERFromText(self, path, topicNumber, numberOfPages):
    allFiles = glob.glob(os.path.join(path, '*.txt'))
    allTopicPersonsFoundByNER = []
    topicNumber = topicNumber - numberOfPages
    while topicNumber < len(allFiles):
        with open(allFiles[topicNumber], "r") as file:
            textOfTopic = file.readlines()[0]
        text = Text(textOfTopic, hint_language_code='hr')
        person = []
        if text:
            # print(text.sentences)
            # allKeyWords[:] += list(text.sentences)
            for entity in text.entities:
                # print(entity)
                if entity.tag == "I-PER":
                    # print(entity)
                    if len(entity) >= 1:
                        name = ""
                        for en in entity:
                            # print(en)
                            name = name + en + " "
                        name = name.strip(" ")
                        if name not in person:
                            person.append(name)
        # print(person)
        allTopicPersonsFoundByNER.append(person)
        topicNumber += 1
    return allTopicPersonsFoundByNER
def part_of_speech(text, list_n, list_adj, list_v):
    ###############
    #
    # Function that extracts three parts of speech (nouns, adjectives, verbs)
    # separately from the other parts of speech.
    #
    # param >> text
    # param >> list_n: list that collects nouns
    # param >> list_adj: list that collects adjectives
    # param >> list_v: list that collects verbs
    #
    # return >> None
    #
    ###############
    # auxiliary/light verb forms that should not be collected
    skip_verbs = ("do", "doing", "did", "does", "be", "is", "are",
                  "were", "was", "can")
    for token in Text(text).pos_tags:
        # print(token)
        # extract only the specific parts of speech
        if u'NOUN' in token:
            list_n.append(token[0])
        elif u'ADJ' in token:
            list_adj.append(token[0])
        elif u'VERB' in token:
            if token[0].lower() not in skip_verbs:
                list_v.append(token[0])
        elif u'AUX' in token:
            if token[0].lower() not in skip_verbs:
                list_v.append(token[0])
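# A minimal usage sketch for part_of_speech, assuming polyglot's English
# embeddings and POS model are installed. The function mutates the three
# caller-supplied lists in place instead of returning a value.
nouns, adjectives, verbs = [], [], []
part_of_speech("The quick brown fox jumps over the lazy dog", nouns, adjectives, verbs)
print(nouns, adjectives, verbs)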
def analyze(doc):
    # Cache entity extraction results per document string.
    if doc not in entities:
        entities[doc] = [
            "_".join(entity)
            for entity in Text(doc, hint_language_code="en").entities
        ]
    return entities[doc]
def nGramCount(text, n):
    """
    Computes the count of the pos tag n-grams of the text supplied.

    Args:
        text (string): The text to be analysed for its pos tag n-gram count
        n (int): Denotes the grams to extract

    Returns:
        Counter: A Counter with the n-gram as the key in the form of a tuple,
        and the count as the value
    """
    if 'postags' not in os.listdir('.'):
        posTagging = Text(text, hint_language_code='da')
        posTagging = np.array(
            [x[-1].encode('utf-8') for x in posTagging.pos_tags])
        np.savetxt('postags', posTagging, delimiter=',', fmt='%s')
    else:
        posTagging = np.loadtxt('postags', delimiter=',', dtype=str)
    new_posTagging = (tuple(posTagging[i:i + n][:])
                      for i in range(len(posTagging) - n + 1))
    return Counter(new_posTagging)
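# A minimal usage sketch for nGramCount, assuming the Danish polyglot POS model
# (e.g. pos2.da) is installed. Note that the function caches the POS tag
# sequence in a file named 'postags' in the current working directory, so later
# calls reuse the tags of the first text analysed.
bigram_counts = nGramCount("Dette er en kort dansk sætning.", 2)
print(bigram_counts.most_common(5))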
def analyze(self, document):
    if document.type in [document.TYPE_TABULAR, document.TYPE_OTHER]:
        return
    collector = DocumentTagCollector(document, self.ORIGIN)
    text = document.text
    if text is None or len(text) <= self.MIN_LENGTH:
        return
    try:
        hint_language_code = None
        if len(document.languages) == 1:
            hint_language_code = document.languages[0]

        text = Text(text, hint_language_code=hint_language_code)
        for entity in text.entities:
            if entity.tag == 'I-LOC' or len(entity) == 1:
                continue
            label = ' '.join(entity)
            if len(label) < 4 or len(label) > 200:
                continue
            collector.emit(label, self.TYPES[entity.tag])
    except ValueError as ve:
        log.info('NER value error: %r', ve)
    except Exception as ex:
        log.warning('NER failed: %r', ex)
    finally:
        log.info('Polyglot extracted %s entities.', len(collector))
        collector.save()
def word_count(text, score, language_code='la'):
    # Counts the words in `text` whose polyglot polarity equals `score`
    # (`score` is assumed to be a target polarity value such as -1, 0 or 1).
    text = Text(lemmatize(str(text)), hint_language_code=language_code)
    words = 0
    for w in text.words:
        if w.polarity == score:
            words += 1
    return words
def do_sentiment_analysis():
    req_data = request.get_json()
    sentence = req_data['sentence']
    time = req_data['time']
    type_of_person = req_data['type_of_person']
    group_chat_name = req_data['group_chat_name']

    text = Text(sentence)
    sentiment = text.polarity

    cursor.execute(
        '''
        INSERT INTO SentiAna.dbo.Sentiment_Analysis(Sentence, Sentiment, TypeOfPerson, GroupName)
        VALUES (?,?,?,?)
        ''', (sentence, sentiment, type_of_person, group_chat_name))
    conn.commit()
    return '''
    status_code:{},
    status:{},
    message:{}
    '''.format('200', 'successful', 'Analysis was successful')
def transliterate_to_arabic(text):
    # takes in a string
    words = re.findall(r"[\w']+|[.,،!?;]", text)
    # words = text.split(" ")

    # transliterate each word alone
    tr_words = []
    for word in words:
        if is_arabic(word) or has_numbers(word) or word.isupper():
            tr_words.append(word)
            continue
        if word == "" or word == "." or word == "," or word == "،":
            tr_words.append(word)
            continue
        word_poly = Text(word)
        try:
            tr_word = word_poly.transliterate("ar")
        except Exception:
            word = word.encode("ascii", errors="ignore").decode()
            tr_words.append(word)
            continue
        # keep the first non-empty transliteration chunk
        for w in tr_word:
            if not w == "":
                tr_words.append(w)
                break

    # combine the words, without a leading whitespace
    return " ".join(tr_words)
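# A minimal usage sketch for transliterate_to_arabic, assuming polyglot's
# Arabic transliteration model is installed (e.g. `polyglot download
# transliteration2.ar`) and that the helpers is_arabic and has_numbers are
# defined alongside this function.
print(transliterate_to_arabic("hello world 2024"))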
def query():
    persons = []
    organizations = []
    locations = []
    article = request.args.get('article')

    # remove punctuation
    exclude = set(string.punctuation)
    article = ''.join(ch for ch in article if ch not in exclude)
    # remove english letters
    article = re.sub(r'[a-zA-Z?]', '', article).strip()

    text = Text(article)
    # language detection
    language = text.language.name
    # NER
    ner = text.entities
    for entity in ner:
        if entity.tag == 'I-PER':
            edit_per = " ".join(entity)
            if edit_per not in persons:
                persons.append(edit_per)
        elif entity.tag == 'I-ORG':
            edit_org = " ".join(entity)
            if edit_org not in organizations:
                organizations.append(edit_org)
        elif entity.tag == 'I-LOC':
            edit_loc = " ".join(entity)
            if edit_loc not in locations:
                locations.append(edit_loc)
def __call__(self, text):
    """Performs language detection, tokenization, and named entity recognition.

    Args:
        text(str): text.

    Returns:
        Dictionary that contains:
        1. tokens - list of objects Token.
        2. entities - list of objects TaggedSpan.
        3. lang - str language of text.
    """
    pg_text = Text(text)
    token_spans = self._tokenize(pg_text)
    pg_entities = pg_text.entities
    ann_entities = []
    for pg_ent in pg_entities:
        tok_begin = token_spans[pg_ent.start]
        tok_end = token_spans[pg_ent.end - 1]
        ann_entities.append(
            ann.TaggedSpan(pg_ent.tag, tok_begin.begin, tok_end.end))
    return {
        'tokens': _make_tokens(token_spans, text),
        'entities': ann_entities,
        'lang': pg_text.language.code
    }