def correct_case(value: str) -> str:
    """Restore natural casing for text written in a single case.

    Text that is entirely upper case or entirely lower case is handed to
    ``truecase.get_true_case``; text that already mixes cases is returned
    unchanged.
    """
    is_single_case = value in (value.upper(), value.lower())
    if not is_single_case:
        return value
    return truecase.get_true_case(value)
def extract_pos_features(segments):
    """
    Compute part-of-speech features for a transcript.

    :param segments: List of text segments. Each segment is a string.
    :return: feats_dict: Dictionary mapping feature name to value for transcript.
        The entries listed in POS_KEY_LIST are proportions of the word count;
        they are all 0.0 when ``segments`` is empty.

    Note: POS is more accurate for segments that contain capitalization, so if text is fully lowercase
    this function will try to infer the true casing before POS detection.
    """
    # initialize feature dictionary with POS types
    feats_dict = {key: 0 for key in POS_KEY_LIST}
    num_words = 0

    # add POS count features
    for segment in segments:
        lowercase = segment.islower()
        # split up into words
        words = segment.split(" ")
        # NOTE: the word count is taken before empty tokens are dropped, so
        # consecutive spaces inflate it; kept as-is for compatibility.
        num_words += len(words)
        if lowercase:
            # if text has been lowercased, transform to true case
            # (i.e. capitalize if supposed to be), so that POS tagger works better
            words = truecase.get_true_case(" ".join(words)).split(" ")
        words = [w for w in words if w != '']
        for _word, tag in nltk.pos_tag(words):
            update_feature_vals(tag, feats_dict)

    # NOTE(review): get_pos_ratios is defined elsewhere; confirm it copes with
    # all-zero counts when segments is empty.
    get_pos_ratios(feats_dict)

    # convert counts to proportions; avoid dividing by zero on empty input
    for key in POS_KEY_LIST:
        count = float(feats_dict[key])
        feats_dict[key] = count / float(num_words) if num_words else 0.0
    return feats_dict
def _normalize_text(self, text: str) -> str:
    """Truecase ``text`` and re-attach opening symbols to the following token.

    A symbol from ``# $ ( < [ {`` that is followed by a space is moved so the
    space precedes it instead, e.g. ``roughly$ 19`` becomes ``roughly $19``.
    """

    def _shift_space(match):
        # Put the space in front of the captured symbol instead of behind it.
        return f" {match.group(1)}"

    truecased = truecase.get_true_case(text)
    return re.sub(r"([#$(<[{]) ", _shift_space, truecased)
def extract_keywords(self, title: str, graf: str):
    """Collect keyword phrases and the first location from a title and paragraph.

    The title and each sentence of ``graf`` are truecased and run through
    ``self.nlp``. Every entity and noun chunk is lower-cased, passed through
    ``self.remove_stopwords`` and, if anything is left, added to the keyword
    set. The first "GPE" entity seen becomes the location. Processing stops
    after the sentence that brings the keyword count to 10 or more.

    Returns:
        A ``(keywords, location)`` tuple; ``location`` is ``None`` when no
        "GPE" entity was found.
    """
    keywords = set()
    location = None

    def _remember(span_text):
        # Store the stopword-free, lower-cased phrase if it is non-empty.
        phrase = self.remove_stopwords(span_text.lower())
        if phrase:
            keywords.add(phrase)

    for sentence in [title, *sent_tokenize(graf)]:
        doc = self.nlp(truecase.get_true_case(sentence))
        for entity in doc.ents:
            if entity.label_ == "GPE" and not location:
                location = entity.text
            _remember(entity.text)
        for noun_chunk in doc.noun_chunks:
            _remember(noun_chunk.text)
        if len(keywords) >= 10:
            break
    return keywords, location


# test
# extractor = KeywordExtractor()
# print(extractor("The state attorneys general in more than a dozen states are preparing to begin an antitrust investigation of the tech giants, according to two people briefed on the discussions, increasing pressure on the companies. The social media companies removed accounts and said they were sowing divisive messages about the Hong Kong protests."))
def postprocess_text(self, text):
    """Postprocesses and prepares the output summarized text.

    Applies, in order: ``self.truecasing_by_sentence_segmentation``,
    ``truecase.get_true_case`` and ``self.fix_summary``.
    """
    sentence_cased = self.truecasing_by_sentence_segmentation(text)
    truecased = truecase.get_true_case(sentence_cased)
    return self.fix_summary(truecased)
def correct_output(in_file, out_file):
    """Replace punctuation placeholder tokens and truecase a text file.

    Reads ``in_file``, substitutes each placeholder (e.g. `` .PERIOD``) with
    its punctuation mark, truecases the whole text and writes it to
    ``out_file``.

    The input is read completely before the output is opened, so a failure
    while processing no longer leaves a truncated output file or open file
    handles behind.

    :param in_file: path of the file containing placeholder tokens.
    :param out_file: path the corrected text is written to.
    """
    PUNCTUATION_MAPPING = {
        " !EXCLAMATIONMARK": "!",
        " .PERIOD": ".",
        " :COLON": ":",
        " ,COMMA": ",",
        " ;SEMICOLON": ";",
        " -DASH": "-",
        " ?QUESTIONMARK": "?",
    }
    with open(in_file, 'r') as in_f:
        contents = in_f.read()

    for placeholder, mark in PUNCTUATION_MAPPING.items():
        contents = contents.replace(placeholder, mark)
    contents = truecase.get_true_case(contents)

    with open(out_file, 'w') as out_f:
        out_f.write(contents)
def calculateParameters(doc: str,
                        scores: Dict[str, float],
                        cands,
                        pr: Dict[str, float] = None):
    """Build a column-normalised feature vector for every candidate phrase.

    Features per candidate, in order: score relative to the best score,
    character length, number of whitespace-separated terms, relative position
    of the first occurrence in ``doc``, one minus the relative position of the
    last occurrence, and the word count of the first named-entity group that
    nltk finds in the truecased candidate.

    :param doc: document text the candidates were taken from.
    :param scores: candidate -> score; candidates missing here score 0.
    :param cands: iterable of candidate phrases.
    :param pr: unused; kept for interface compatibility.
    :return: dict mapping each candidate to its numpy feature row, each column
        divided by its maximum (columns whose maximum is 0 become 0).
        An empty dict when there are no candidates.
    """
    all_cands = list(cands)
    if not all_cands:
        # np.max over an empty array would raise; nothing to describe anyway.
        return {}

    max_cand_score = max(scores.values(), default=0.)
    # Guard the position ratios against an empty document.
    doc_len = max(len(doc), 1)

    params = []
    for cand in all_cands:
        if cand in scores and max_cand_score:
            cand_score = scores[cand] / max_cand_score
        else:
            cand_score = 0.

        cand_len = len(cand)
        cand_term_count = len(cand.split())
        # str.find/rfind return -1 when the candidate is absent, which yields
        # a small negative ratio (original behaviour, kept).
        first_match = doc.find(cand) / doc_len
        last_match = doc.rfind(cand) / doc_len

        ne_cand = get_true_case(cand)
        words = nltk.pos_tag(nltk.word_tokenize(ne_cand))
        ne = nltk.tree2conlltags(nltk.ne_chunk(words))
        # Join consecutive tokens whose chunk tag is not 'O' into phrases.
        ne = [
            ' '.join(word for word, pos, chunk in group).lower()
            for key, group in itertools.groupby(ne, lambda tpl: tpl[2] != 'O')
            if key
        ]
        ne_cnt = len(ne[0].split()) if ne else 0

        params.append([
            cand_score, cand_len, cand_term_count, first_match,
            1 - last_match, ne_cnt
        ])

    params = np.array(params)
    max_ = params.max(axis=0)
    params = np.divide(params,
                       max_,
                       out=np.zeros_like(params),
                       where=max_ != 0)
    return dict(zip(all_cands, params))
def get_url2description_and_vectors(url):
    """Compute keywords and keyword vectors for the caption of ``url``.

    Increments the global ``count``, reads the caption from ``url2caption``,
    truecases and preprocesses it, runs ``FindKeywords`` and stores both
    results in ``url2description`` / ``url2description_vector``. Access to the
    shared globals is guarded by ``lock``; the lock is released in ``finally``
    so an exception (e.g. an unknown ``url``) cannot leave it held.
    """
    global count

    lock.acquire()
    try:
        count += 1
        caption = url2caption[url]
    finally:
        lock.release()

    # The expensive work runs outside the lock.
    case_caption = truecase.get_true_case(caption)
    case_caption = preprocessing(case_caption)
    url2keywords, url2keywords_vector = FindKeywords(case_caption)

    lock.acquire()
    try:
        url2description[url] = url2keywords
        url2description_vector[url] = url2keywords_vector
    finally:
        lock.release()
def truecase_sentence(tokens):
    """Truecase an all-caps token list in place and return it.

    Only tokens made purely of alphabetic characters are considered. If every
    such token is upper case (A-Z), they are joined, truecased and written
    back to their positions. The list is returned untouched when that is not
    the case, or when truecase yields a different number of words.
    """
    expected_len = len(tokens)
    alpha_tokens = [(token, position)
                    for position, token in enumerate(tokens)
                    if all(char.isalpha() for char in token)]
    upper_tokens = [token for token, _ in alpha_tokens
                    if re.match(r'\b[A-Z]+\b', token)]

    if upper_tokens and len(upper_tokens) == len(alpha_tokens):
        recased = truecase.get_true_case(' '.join(upper_tokens)).split()
        if len(recased) != len(alpha_tokens):
            return tokens
        for (_, position), replacement in zip(alpha_tokens, recased):
            tokens[position] = replacement

    assert len(tokens) == expected_len
    return tokens
def postprocess(cls, text: str) -> str:
    """Post-processes the text.

    The text is split with ``sentence_tokenize``, the sentences are joined
    with single spaces, and the result is truecased.

    Args:
        text (:obj:`str`): The text to be post-processed.

    Returns:
        :obj:`str`: The post-processed text.
    """
    sentences = sentence_tokenize(text)
    joined = ' '.join(sentences)
    return get_true_case(joined)
def create_data(self, sentence_4):
    """Derive the query and sentences 3, 2 and 1 from ``sentence_4``.

    Each derived sentence is truecased. The record is printed as indented
    JSON, written through ``self.sia_dataset_reader.write_dataset`` and
    returned as ``(query, sentence_4, sentence_3, sentence_2, sentence_1)``.
    """
    query = self.get_query(sentence_4)
    entities_replacements = self.get_entities_replacements(sentence_4)

    raw_sentence_3 = self.get_sentence_3(sentence_4, entities_replacements)
    sentence_3 = truecase.get_true_case(raw_sentence_3)
    raw_sentence_2 = self.get_sentence_2(sentence_3, entities_replacements)
    sentence_2 = truecase.get_true_case(raw_sentence_2)
    sentence_1 = truecase.get_true_case(entities_replacements[0][2])

    record = (query, sentence_4, sentence_3, sentence_2, sentence_1)
    field_names = ('query', 'sentence_4', 'sentence_3', 'sentence_2',
                   'sentence_1')
    print(json.dumps(dict(zip(field_names, record)), indent=4,
                     sort_keys=False))

    self.sia_dataset_reader.write_dataset(*record)
    return record
def dominant_entities_text(stanford_client: StfNERTagger, client_google, comment: str) -> tuple:
    """
    Analyze the text and return its entities as reported by Google Cloud.

    :param stanford_client: connection to stanford client
    :param client_google: connection to google cloud
    :param comment: text to analyze
    :return: tuple of three lists: entity names, entity saliences (as
        strings) and the distinct entity type names. Three empty lists are
        returned for an empty comment or when the Google request fails.
    """
    if len(comment) == 0:
        return [], [], []

    stf_person_types = stanford_client.identify_person_types(
        text=truecase.get_true_case(comment))
    document = ClientsLanguageSentiment.convert_type_google_document(comment)
    encoding = enums.EncodingType.UTF32

    names = []
    saliences = []
    type_names = []
    try:
        response = client_google.analyze_entities(document, encoding)
        for entity in response.entities:
            entity_type = enums.Entity.Type(entity.type)

            # Discard very short or very long names: they tend to be noise
            # attached to an entity detection from google.
            if not 3 < len(str(entity.name)) < 25:
                continue

            # Stanford is the baseline for person names: anything it tagged
            # is skipped whatever type Google assigned. (The original test
            # checked PERSON and non-PERSON separately, which is the same
            # condition.)
            if entity.name.lower() in stf_person_types:
                continue

            # Numeric / contact / date-like entity types are not wanted.
            if entity_type in [enums.Entity.Type.NUMBER,
                               enums.Entity.Type.PHONE_NUMBER,
                               enums.Entity.Type.ADDRESS,
                               enums.Entity.Type.PRICE,
                               enums.Entity.Type.DATE]:
                continue

            names.append(entity.name)
            saliences.append(entity.salience)
            type_names.append(entity_type.name)

        saliences = list(map(str, saliences))
        type_names = list(set(type_names))
    except Exception as e:
        logger.error(msg=str(e))
        return [], [], []

    return names, saliences, type_names
def truecase_sentence(self, tokens):
    """Truecase an all-caps token list in place and return it.

    Only purely alphabetic tokens are considered; they are truecased together
    when all of them match the upper-case pattern. ``tokens`` is returned
    unchanged otherwise.
    """
    alpha_tokens = [(token, position)
                    for position, token in enumerate(tokens)
                    if all(char.isalpha() for char in token)]
    upper_tokens = [token for token, _ in alpha_tokens
                    if re.match(r'\b[A-Z\.\-]+\b', token)]

    if not upper_tokens or len(upper_tokens) != len(alpha_tokens):
        return tokens

    recased = truecase.get_true_case(' '.join(upper_tokens)).split()
    # The truecaser has its own tokenization; only write the words back when
    # the number of words still matches.
    if len(recased) == len(alpha_tokens):
        for (_, position), replacement in zip(alpha_tokens, recased):
            tokens[position] = replacement
    return tokens
def calculateParameters(all_cands, doc, scores):
    """Build a raw (un-normalised) feature vector for every candidate phrase.

    Features per candidate, in order: score (0 when the candidate is not in
    ``scores``), character length, number of whitespace-separated terms,
    relative position of the first and of the last occurrence in ``doc``,
    the spread between the two, and the word count of the first named-entity
    group that nltk finds in the truecased candidate.

    :param all_cands: iterable of candidate phrases.
    :param doc: document text the candidates were taken from.
    :param scores: dict mapping candidate -> score.
    :return: list with one 7-element feature list per candidate.
    """
    # Guard the position ratios against an empty document.
    doc_len = max(len(doc), 1)

    params = []
    for cand in all_cands:
        cand_score = scores[cand] if cand in scores else 0.
        cand_len = len(cand)
        cand_term_count = len(cand.split())

        ne_cand = get_true_case(cand)
        words = nltk.pos_tag(nltk.word_tokenize(ne_cand))
        ne = nltk.tree2conlltags(nltk.ne_chunk(words))
        # Join consecutive tokens whose chunk tag is not 'O' into phrases.
        ne = [
            ' '.join(word for word, pos, chunk in group).lower()
            for key, group in itertools.groupby(ne, lambda tpl: tpl[2] != 'O')
            if key
        ]
        ne_cnt = len(ne[0].split()) if ne else 0

        # str.find/rfind return -1 when the candidate is absent, which yields
        # a small negative ratio (original behaviour, kept).
        first_match = doc.find(cand) / doc_len
        last_match = doc.rfind(cand) / doc_len
        spread = 0. if first_match == last_match else last_match - first_match

        params.append([
            cand_score, cand_len, cand_term_count, first_match, last_match,
            spread, ne_cnt
        ])
    return params
def truecase_sentence(tokens):
    """
    from https://github.com/ghaddarAbs
    for experimenting with CoNLL-2003 casing

    Truecases the purely alphabetic tokens in place when all of them are
    upper case, and returns ``tokens``.
    """
    alpha_positions = [i for i, tok in enumerate(tokens)
                       if all(c.isalpha() for c in tok)]
    caps = [tokens[i] for i in alpha_positions
            if re.match(r'\b[A-Z\.\-]+\b', tokens[i])]

    if caps and len(caps) == len(alpha_positions):
        parts = truecase.get_true_case(' '.join(caps)).split()
        # the truecaser has its own tokenization ...
        # skip if the number of words doesn't match
        if len(parts) != len(alpha_positions):
            return tokens
        for i, part in zip(alpha_positions, parts):
            tokens[i] = part
    return tokens
def get_entities(DocList):
    """Return the unique named-entity phrases found in ``DocList``.

    Each document is truecased and passed through ``Preprocessing``, then
    tagged with spaCy's ``en_core_web_sm`` model. Entities labelled CARDINAL,
    ORDINAL, DATE or ORG, or whose text has at most two characters, are
    dropped. The per-document lists of lower-cased entity texts go through
    ``remove_stopwords``; every non-empty item of its result is joined with
    spaces and the distinct strings are returned as a list.
    """
    # Approach follows
    # https://www.geeksforgeeks.org/python-named-entity-recognition-ner-using-spacy/
    ListDoc = [Preprocessing(truecase.get_true_case(sent)) for sent in DocList]
    black_list = ['CARDINAL', 'ORDINAL', 'DATE', 'ORG']
    nlp = spacy.load('en_core_web_sm')

    per_doc_entities = []
    for doc in ListDoc:
        kept = []
        for ent in nlp(doc).ents:
            if ent.label_ not in black_list and len(ent.text) > 2:
                kept.append(ent.text.lower())
        per_doc_entities.append(kept)

    unique_phrases = {
        ' '.join(words)
        for words in remove_stopwords(per_doc_entities)
        if len(words) > 0
    }
    return list(unique_phrases)
def extract_graph_feats(segments):
    """
    Compute graph-based features (naive, lemma and POS graphs) for a transcript.

    Fully lowercase segments are truecased first because this helps POS
    detection. The truecased text is kept in a local copy; the caller's
    ``segments`` list is no longer modified.

    :param segments: List of text segments. Each segment is a string.
    :return: feats_dict: Dictionary mapping feature names to values
    """
    feats_dict = {}

    # if a segment is all lowercase, try to infer true case (this helps POS detection)
    cased_segments = [
        truecase.get_true_case(seg) if seg.islower() else seg
        for seg in segments
    ]

    # break segments up into words, dropping the empty strings that
    # consecutive spaces produce
    segments_mixed_case = [[w for w in s.split(" ") if w != '']
                           for s in cased_segments]

    # also get lowercase version (used for naive graph); split() without an
    # argument never yields empty strings, so no filtering is needed
    segments_lower_case = [s.lower().split() for s in cased_segments]

    # build graphs
    naive_graph = create_naive_graph(segments_lower_case)
    lemma_graph = create_lemma_graph(segments_mixed_case)  # POS detection is used to help with lemmatization
    pos_graph = create_pos_graph(segments_mixed_case)

    # compute features for each graph
    get_graph_metrics(naive_graph, 'naive', feats_dict)
    get_graph_metrics(lemma_graph, 'lemma', feats_dict)
    get_graph_metrics(pos_graph, 'pos', feats_dict)

    # add normalized versions of features
    word_count = get_word_count(cased_segments)
    add_norm_feats(feats_dict, word_count)
    return feats_dict
def get_first_city(message: str) -> str:
    '''
    returns the first city in the message

    The message is passed through ``correct`` and truecased before entity
    recognition. Entities labelled 'GPE' or 'PERSON' are looked up with
    ``gc.get_cities_by_name``; the first one with a match wins.

    Arguments:
    - message: the message provided

    Returns:
    - a city if found else empty string
    '''
    doc = nlp(truecase.get_true_case(correct(message)))
    candidates = (ent for ent in doc.ents if ent.label_ in ['GPE', 'PERSON'])
    for ent in candidates:
        if gc.get_cities_by_name(ent.text):
            return ent.text
    return ''
def to_truecase(tokens):
    """
    Truecase an all-caps token list in place and return it.

    code from https://github.com/google-research/bert/issues/223#issuecomment-649619302

    original tokens
        ['FULL', 'FEES', '1.875', 'REOFFER', '99.32', 'SPREAD', '+20', 'BP']
    truecased tokens
        ['Full', 'fees', '1.875', 'Reoffer', '99.32', 'spread', '+20', 'BP']

    Only purely alphabetic tokens are considered, and only when all of them
    are upper case. ``tokens`` is returned on every path; previously the
    function returned ``None`` unless the truecaser changed the word count.
    """
    word_lst = [(w, idx) for idx, w in enumerate(tokens)
                if all(c.isalpha() for c in w)]
    lst = [w for w, _ in word_lst if re.match(r'\b[A-Z\.\-]+\b', w)]

    if lst and len(lst) == len(word_lst):
        parts = truecase.get_true_case(' '.join(lst)).split()

        # the truecaser has its own tokenization ...
        # skip if the number of words doesn't match
        if len(parts) != len(word_lst):
            return tokens

        for (_, idx), nw in zip(word_lst, parts):
            tokens[idx] = nw
    return tokens
def get_Captions_Entities():
    """Map each caption row to the entities spaCy finds in its text.

    Reads ``Captions/Mira_ToBeLabeled.csv`` (ISO-8859-1) and skips its first
    row. For every other row, column 3 is truecased and tagged with the
    ``en_core_web_sm`` model; the lower-cased entity texts are passed through
    ``remove_stopwords``, each non-empty item of the result is joined with
    spaces, and the distinct strings are stored under the value of column 2.

    Returns:
        dict mapping column-2 values to lists of entity strings.
    """
    Captions = {}
    nlp = spacy.load('en_core_web_sm')
    with open('Captions/Mira_ToBeLabeled.csv', newline='',
              encoding='ISO-8859-1') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the first (header) row
        for row in reader:
            # Alternative kept from the original:
            # doc = Preprocessing(truecase.get_true_case(row[3]))
            doc = truecase.get_true_case(row[3])
            lowered = [ent.text.lower() for ent in nlp(doc).ents]
            unique_phrases = {
                ' '.join(words)
                for words in remove_stopwords(lowered)
                if len(words) > 0
            }
            Captions[row[2]] = list(unique_phrases)
    return Captions
def create_doc(text):
    """Run ``nlp`` on ``text`` with newlines replaced by spaces.

    When the module-level ``TRUE_CASE`` flag is set, the text is truecased
    before the newlines are replaced.
    """
    source = truecase.get_true_case(text) if TRUE_CASE else text
    return nlp(source.replace('\n', ' '))
# normalizeText_duc05(self, multi_doc): read DUC-2005 style XML input and
# return the tuple (text, doc_title, task_name).
#
# * task_name is the base name of self.filesList[0] without its extension.
# * When multi_doc is falsy, self.filesList[0] is parsed as a single XML file
#   with ET.parse. On FileNotFoundError/PermissionError the user is prompted
#   via input() for another path. On ET.ParseError the file content is
#   rewritten (numeric attribute values get quoted) into a temporary
#   '<task_name>.xml', parsed from there, and the temporary file is removed.
#   The children of the root are accumulated into self._xmlPart; the body is
#   taken from the 'TEXT' entry and the title from the first of the
#   HEAD / HL / HEADLINE / H3 entries.
# * Otherwise self.filesList[0] is listed as a directory and every file in it
#   is parsed the same way, using self.find_tags_recursively to collect the
#   tags; titles of files whose name starts with 'LA' are truecased with
#   get_true_case, and titles/texts are concatenated one per line.
# * The hyphenated words listed in not_hyphenated have their '-' replaced by
#   a space in the title, and a series of regex clean-ups is applied to the
#   text before returning.
#
# NOTE(review): this definition has lost its line structure (it is collapsed
# onto three physical lines), so it is documented here instead of being
# reformatted; restore the original indentation from version control.
# NOTE(review): `doc_title.strip()` discards its result; probably meant
# `doc_title = doc_title.strip()`. Confirm.
# NOTE(review): `re.sub(r' *\(.*\)', r'', title_text, re.DOTALL)` passes
# re.DOTALL as the fourth positional argument of re.sub, which is `count`,
# not `flags`. Confirm the intent.
# NOTE(review): `re.sub(r'&', r'&', xml_content)` replaces '&' with itself;
# presumably the replacement was meant to be an XML entity. Confirm.
# NOTE(review): `files` appears to be assigned only under
# `len(self.filesList) == 1`; verify the other case against the original
# indentation.
def normalizeText_duc05(self, multi_doc): # Added list of not_hyphenated words that should correct in titles not_hyphenated = [ 'COLOMBIA-L.A.', 'MIAMI-TO-L.A.', 'Turkey-Syria', 'VW-Opel', 'GM-VW', 'O.C.-L.A.', 'Ever-more-complex', 'ARGENTINE-BRITISH', '46-MILLION', 'UK-Argentine', 'PRO-MARIJUANA', 'test-case', '2.46-MILLION', 'plasma-based', 'JOB-DEATH', 'GOLD-MINING', '1.3-MILLION', '150-MILLION-A-YEAR', 'Atom-smasher', 'LITHIUM-WATER', 'NERVE-GAS', 'drug-makers', 'EAST-WEST', '2-MILLION-SQUARE-MILE', 'Hume-Adams', 'BEAR-POACHERS', 'Robot-selected', 'self-rule', 'Ulster-style', '71-MILLION', '1.5-MILLION', '12-MILLION', '7-MILLION', 'Iran-U.S.' ] type_of_titles = ['HEAD', 'HL', 'HEADLINE', 'H3'] task_name = splitext(basename(self.filesList[0]))[0] if not multi_doc: while True: try: tree = ET.parse(self.filesList[0]) except (FileNotFoundError, PermissionError): self.filesList = (input( "The path doesn\'t contain file\'s name. Please enter complete path of a file containing file\'s name:\n"),) except ET.ParseError: with open(self.filesList[0]) as xmlFile: xml_content = xmlFile.read() xml_content = re.sub( r'(=)([0-9]+)(>)', r'\1"\2"\3', xml_content) xml_content = re.sub(r'&', r'&', xml_content) with open('{}.xml'.format(task_name), 'w') as new_xmlFile: new_xmlFile.write(xml_content) tree = ET.parse('{}.xml'.format(task_name)) remove('{}.xml'.format(task_name)) break else: break root = tree.getroot() for child in root: if child.tag not in self._xmlPart: self._xmlPart[child.tag] = child.text else: pass for subChild in child: if subChild.text is not None: self._xmlPart[child.tag] += subChild.text else: pass if child.tag == 'TEXT' and subChild.tag == 'F': self._xmlPart[child.tag] += subChild.tail text = self._xmlPart.get('TEXT') _title = [value for key, value in self._xmlPart.items() if key in type_of_titles] doc_title = '' if _title: doc_title += _title[0] doc_title = re.sub(r'\n\n', '\n', doc_title) doc_title = re.sub(r'([^.])\n', r'\1 ', doc_title) doc_title = 
re.sub(r'@', r'', doc_title) doc_title = re.sub(r' +----+ +.*', r'', doc_title) doc_title = re.sub(r'[``|"|:]', '', doc_title) doc_title = re.sub(r' +', ' ', doc_title) doc_title.strip() else: if len(self.filesList) == 1: while True: try: files = [file for file in listdir(self.filesList[0]) if isfile( join(self.filesList[0], file))] except NotADirectoryError: self.filesList = ( input("You should enter a folder directory:\n"),) else: break text, doc_title = '', '' for file in files: try: tree = ET.parse(join(self.filesList[0], file)) except ET.ParseError: with open(join(self.filesList[0], file)) as xmlFile: xml_content = xmlFile.read() xml_content = re.sub( r'(=)([0-9]+)(>)', r'\1"\2"\3', xml_content) xml_content = re.sub(r'&', r'&', xml_content) with open('{}.xml'.format(task_name), 'w') as new_xmlFile: new_xmlFile.write(xml_content) tree = ET.parse('{}.xml'.format(task_name)) remove('{}.xml'.format(task_name)) root = tree.getroot() xml_part = dict() self.find_tags_recursively(root, xml_part) temp_text = xml_part.get('TEXT') _title = [value for key, value in xml_part.items() if key in type_of_titles] title_text = '' if _title: title_text += _title[0].lstrip() title_text = re.sub(r'([^.])\n', r'\1 ', title_text) title_text = re.sub('FT.*[0-9]+ +/ +', '', title_text) title_text = re.sub(r'--+', r'', title_text) title_text = re.sub(r' +- *', r' ', title_text) title_text = re.sub(r'(``|\'\'|")', r'', title_text) # For removing ' from beginning of a quotation title_text = re.sub( r'(\s)\'(\w+)', r'\1\2', title_text) title_text = re.sub( r' *\(.*\)', r'', title_text, re.DOTALL) title_text = re.sub(r'\n', r' ', title_text) title_text = title_text.strip() if file.startswith('LA'): title_text = get_true_case(title_text) doc_title += '{}\n'.format(title_text) if temp_text: # For removing complete tables from text temp_text = re.sub( r'( +-{4,}\n)(.+\n)*( +-{4,}\n*)', r'', temp_text) temp_text = re.sub(r'([^.])\n', r'\1 ', temp_text) temp_text = temp_text.strip() text += 
'{}\n'.format(temp_text) doc_title = re.sub(r'\n\n', r'\n', doc_title) doc_title = re.sub(r'\n ', r'\n', doc_title) hyphen_words = re.findall( r'\s*(?:{})(?:\.|,|!|\?|/|\'|\s+)'.format('|'.join(not_hyphenated)), doc_title, re.I) if hyphen_words: hyphen_dict = dict() for word in hyphen_words: hyphen_dict.update({word: word.replace('-', ' ')}) hyphen_dict = dict((re.escape(k), v) for k, v in hyphen_dict.items()) pattern = re.compile("|".join(hyphen_dict.keys())) doc_title = pattern.sub( lambda m: hyphen_dict[re.escape(m.group(0))], doc_title) doc_title = doc_title.replace('POP/ROCK', 'POP ROCK') doc_title = doc_title.replace('/LOCAL', 'LOCAL') text = text.strip() text = re.sub(r'\.( \"[A|a]nd)', r'\1', text) text = re.sub(r'(\.\.\.|\. \. \.)', r'', text) text = re.sub(r'--', r'', text) text = re.sub(r'([A-Z.][A-Z.])( +\n+ +)', r'\1 .\2', text) text = re.sub(r'(Inc\.)( +\n\n+ +)', r'\1 .\2', text) text = re.sub(r'\n\n', '\n', text) text = re.sub(r'.(\" [a-zA-Z0-9]* said.)', r',\1', text) text = re.sub(r'(``|\'\'|")', r'', text) # for removing ' from beginning of a quotation text = re.sub(r'(\s)\'(\w+)', r'\1\2', text) text = re.sub(r' \. \. \.', r'', text) text = re.sub(r'(.)\s+(\n)', r'\1\2', text) text = re.sub(r'([^.])\n', r'\1 ', text) text = re.sub(r'^.*\[Text\] ', r'', text) text = re.sub(r'\[.*\] ', r'', text) text = re.sub(r'(\n) (\S)', r'\1\2', text) text = re.sub(r'(AG|GM|VW|Volkswagen|and|he|his|Essex)/' '(GM|Opel|General|or|she|her|London)', r'\1 \2', text) return text, doc_title, task_name
def generateReply(body, db_message):
    """Produce the chatbot reply and optional grammar correction for a message.

    Unauthenticated users get a "Page Blocked" JSON response. Otherwise the
    message body is truecased, spell-checked and punctuated; the chatbot
    answers (or says goodbye when the message contains 'bye'); and the
    grammar checker's output is compared with the message after both are
    passed through ``stripChars``. If they match, or the message has at most
    two space-separated parts, the message is marked correct; otherwise a
    'Did you mean: ...' correction is stored. New ``Message`` rows are added
    and committed, and a JSON payload describing both is returned.
    """
    if not current_user.is_authenticated:
        return jsonify({"status": "Page Blocked", "authenticated": False})

    user = current_user

    # fix capitalization, spelling, and punctuation
    cased_body = truecase.get_true_case(body)
    message = addPunctuation(spell_checker.correct_sentence(cased_body))

    print('Before Prediction')
    print('NUMBER OF THREADS: ', threading.active_count())
    if 'bye' in message.lower():
        chatbot_body = 'See you later!'
    else:
        chatbot_body = chatbot.predictResponse(context=message)
    print('DOne Predicting')
    chatbot_body = truecase.get_true_case(chatbot_body)

    grammar_correction_response = grammar_checker.check_grammar(
        input_sentence=message)
    stripped_message = stripChars(message)
    stripped_correction = stripChars(grammar_correction_response)

    is_correct = (stripped_message == stripped_correction
                  or len(message.split(' ')) <= 2)
    grammar_body = ''
    if is_correct:
        user.userData[0].correctSentences += 1
        db_message.correct = 1
    else:
        formatted_grammar_response = truecase.get_true_case(
            grammar_correction_response)
        grammar_body = 'Did you mean: ' + formatted_grammar_response
        db_message.correct = 0

    order = 2
    chatbot_response = Message(body=chatbot_body, author=user, order=order)
    db.session.add(chatbot_response)

    grammar_correction = None
    if grammar_body != '':
        grammar_correction = Message(body=grammar_body,
                                     author=user,
                                     order=order)
        db.session.add(grammar_correction)
    db.session.commit()

    # Without a correction the timestamp and order are reported as 0.
    if grammar_correction is None:
        grammar_timestamp = 0
        grammar_order = 0
    else:
        grammar_timestamp = grammar_correction.timestamp
        grammar_order = grammar_correction.order

    return jsonify({
        "chatbot_response": {
            "body": chatbot_response.body,
            "timestamp": chatbot_response.timestamp,
            "order": chatbot_response.order,
        },
        "grammar_correction": {
            "body": grammar_body,
            "timestamp": grammar_timestamp,
            "order": grammar_order,
        }
    })
def _truecase_hook(raw):
    """Truecase ``raw``, passing ``oov`` from the surrounding scope as the
    out-of-vocabulary token option."""
    truecased = truecase_.get_true_case(
        raw, out_of_vocabulary_token_option=oov)
    return truecased
# Load the saved bias mapping for this language pair and layer.
temp = np.load(
    'mapping/layer-8/europarl-v7.%s-%s.%s.GBDD' % (src, tgt, args.layer),
    allow_pickle=True)
bias = torch.tensor(temp, dtype=torch.float).to(device)

# Read the WMT17 test set and pull out the columns used below.
data = pd.read_csv(os.path.join('WMT17', 'testset', path), sep='\t')
references, translations, source, human_score, sentBLEU = (
    data[column].tolist()
    for column in ('reference', 'translation', 'source', 'HUMAN_score',
                   'sentBLEU'))

# Detokenize each side with the detokenizer of its own language.
with MosesDetokenizer(src) as detokenize:
    source = [detokenize(sentence.split(' ')) for sentence in source]
with MosesDetokenizer(tgt) as detokenize:
    references = [detokenize(sentence.split(' ')) for sentence in references]
    translations = [
        detokenize(sentence.split(' ')) for sentence in translations
    ]

translations = [truecase.get_true_case(sentence) for sentence in translations]

xmoverscores = scorer.compute_xmoverscore(
    args.alignment,
    projection,
    bias,
    source,
    translations,
    ngram=args.ngram,
    layer=args.layer,
    dropout_rate=args.dropout_rate,
    bs=args.batch_size)
lm_scores = scorer.compute_perplexity(translations, bs=1)
scores = metric_combination(xmoverscores, lm_scores, [1, 0.1])

# Report the correlation with human scores, without and with the LM term.
print('\r\nlp:{} xmovescore:{} xmoverscore+lm:{}'.format(
    lp, pearson(human_score, xmoverscores), pearson(human_score, scores)))
def geoparsing(self, text, case_correct=False, limit=5, gazetteer_cg=False):
    """
    Method that performs the geoparsing of text.

    NOTE: using the geoparsing without the correct case and without the
    gazetteer will give you poor results.

    When ``case_correct`` is False (and the gazetteer is not used) the text
    is truecased, translated to English, truecased again, translated back to
    Portuguese and then geoparsed as correctly cased text. ``limit`` is now
    forwarded on that path; previously it was silently reset to the default.

    Params:
    ----------
    text : String
        - Text on which to perform the geoparsing.
    case_correct: Bool
        - If the text is with correct case.
    limit: Int
        - Maximum limit of returned addresses.
    gazetteer_cg: Bool
        - If you want to use the gazetteer with locations in the state of Paraíba.

    Return:
    ----------
    result : List
        - List of addresses.

    Raises:
    ----------
    Exception
        - If no address could be found for the text.
    """
    if gazetteer_cg:
        result = self.filter_address_text(text.lower())
        if result:
            return result
        raise Exception("Text geoparsing could not be performed")

    if case_correct:
        doc = self.nlp(text)
        ents_loc = [
            entity for entity in doc.ents
            if entity.label_ == "LOC" or entity.label_ == "GPE"
        ]
        address_found = self.concantenate_address(ents_loc)
        result = self.check_address(address_found, limit)
        if result[0]:
            return result[1]
        raise Exception("Text geoparsing could not be performed")

    # Casing is unreliable: recover it through a round trip via English.
    text = truecase.get_true_case(text)
    text_en = self.translator.translate(text, dest="en").text
    text_true_case = truecase.get_true_case(text_en)
    text_pt = self.translator.translate(text_true_case, src="en", dest="pt")
    return self.geoparsing(text_pt.text, case_correct=True, limit=limit)
def tc(string):
    """Truecase ``string`` behind a temporary 'A ' prefix.

    The prefix is prepended before calling ``truecase.get_true_case`` and the
    first two characters of the result are cut off again (presumably so the
    first word of ``string`` is not treated as sentence-initial).
    """
    prefix = 'A '
    return truecase.get_true_case(prefix + string)[len(prefix):]
def tc(string):
    """Truecase ``string`` behind a temporary 'A ' prefix.

    The prefix is prepended before calling ``truecase.get_true_case`` and the
    first two characters of the result are cut off again (presumably so the
    first word of ``string`` is not treated as sentence-initial).
    """
    padded = 'A ' + string
    truecased = truecase.get_true_case(padded)
    return truecased[2:]
def preprocess_text(inputText):
    """Collapse repeated spaces, expand contractions and truecase the text."""
    single_spaced = re.sub(' +', ' ', inputText)  # collapse runs of spaces
    expanded = contractions.fix(single_spaced)  # expand contractions
    return truecase.get_true_case(expanded)
import truecase

# Truecase every line of allqueries.txt into allQueries.txt, writing a newline
# after each truecased line. Both files are closed even if truecasing fails.
# NOTE(review): the two file names differ only by letter case; on a
# case-insensitive file system they refer to the same file, and opening it
# for writing would truncate the input. Confirm the target platform.
with open("allqueries.txt", "r") as fi, open("allQueries.txt", "w") as fo:
    for q in fi:
        fo.write(truecase.get_true_case(q))
        fo.write("\n")