def realize_pas(pas):
    """
    Produce the sentence realization given a PAS.

    :param pas: PAS to realize.
    :return: realized PAS.
    """
    phrase = ""
    raw_pas = pas.raw_pas

    # Adding spaces to avoid errors like finding "he" in "the".
    # Removing punctuation, as we only need to find the position of the arguments.
    full_sent = remove_punct(" " + pas.sentence + " ")
    args_positions = []

    # For every value of the SRL dictionary, the position of this value is found in the original
    # sentence and placed in a list which is sorted by position afterwards.
    redundant_modals = ["going", "has", "have", "had"]
    for arg_key in raw_pas.keys():
        arg_val = raw_pas[arg_key]

        # Excluding "not" and modals that might be repeated in the verb fixing process.
        if arg_key != "AM-NEG" and not (arg_key == "AM-MOD" and arg_val in redundant_modals):
            # Replacing double spaces with a single space so the argument is found in the sentence.
            arg = (" " + remove_punct(arg_val) + " ").replace("  ", " ")
            arg_index = full_sent.find(arg)

            # Verbs have to be fixed as SENNA clears auxiliaries and modals.
            if arg_key == "V":
                arg_val = fix_verb(pas)

            arg_pos = (arg_index, arg_val)
            args_positions.append(arg_pos)

    # Sorting the arguments by their position in the original sentence.
    sorted_args = sorted(args_positions, key=lambda tup: tup[0])

    # Building the phrase by spacing the arguments.
    for arg_pos in sorted_args:
        phrase += arg_pos[1] + " "

    # De-spacing the contracted forms ("I 'm" to "I'm").
    phrase = re.sub("([a-zA-Z0-9]) \'([a-zA-Z0-9])", r"\1'\2", phrase)
    # De-spacing quoted and parenthesized spans ('" aa "' to '"aa"').
    phrase = re.sub("\" ([a-zA-Z0-9 ,']+) \"", r'"\1"', phrase)
    phrase = re.sub("\( ([a-zA-Z0-9 ,']+) \)", r'"\1"', phrase)
    # Replacing spaced punctuation with a comma.
    phrase = re.sub(" [.,:;] ", r", ", phrase)

    return phrase
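# ``remove_punct`` is used throughout this section but its definition is not shown here.
# The sketch below is only an assumption about its behaviour (strip ASCII punctuation);
# the docstring of ``process_names`` further below suggests that '.' and "'" may be kept,
# so that variant is used. The real helper in the repository may differ.
import string

def remove_punct_sketch(text, keep="'."):
    """Hypothetical stand-in for remove_punct: drop ASCII punctuation except ``keep``."""
    drop = ''.join(c for c in string.punctuation if c not in keep)
    return text.translate(str.maketrans('', '', drop))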
def tag_product(product_title):
    """
    Tag product_title and return core term, brand name, and descriptions.

    Input:
        string: product_title
    Return:
        string: core_term
        string: brand
        string: disc
    """
    ## Build a tagger model.
    with open(Table_PATH, 'rb') as f:
        tag_table = pickle.load(f)
    tagger = nltk.UnigramTagger(model=tag_table, backoff=nltk.DefaultTagger('D'))

    ## Remove punctuation from the product title.
    product_title_tmp = remove_punct(product_title)

    ## Convert plurals to singulars.
    wnl = nltk.WordNetLemmatizer()
    product_words = [wnl.lemmatize(s) for s in product_title_tmp.split()]
    clean_title = ' '.join(product_words)

    ## Build unigrams, bigrams, and trigrams from which product
    ## attributes are to be extracted.
    unigrams = extract_words(product_words)
    bigrams = [' '.join(item) for item in ngrams(unigrams, 2)]
    trigrams = [' '.join(item) for item in ngrams(unigrams, 3)]

    ## Extract attributes from trigrams. If that fails, extract from bigrams.
    ## If that still fails, extract from unigrams. If even that fails, set the
    ## last alphabetic noun as the product core term and leave the brand empty.
    core_term, brand = None, None
    core_term, brand = extract_attributes(trigrams, core_term, brand, tagger)
    if not core_term or not brand:
        core_term, brand = extract_attributes(bigrams, core_term, brand, tagger)
    if not core_term or not brand:
        core_term, brand = extract_attributes(unigrams, core_term, brand, tagger)
    if not core_term:
        pos_words = nltk.pos_tag(unigrams)
        for word, tag in pos_words[::-1]:
            if tag == 'NN' and word.isalpha():
                core_term = word
                break
    if not brand:
        brand = ''

    ## The words other than the core term and brand name are regarded as
    ## description information.
    disc = ''
    try:
        disc = clean_title.replace(core_term, '').replace(brand, '')
        disc = ' '.join(w for w in disc.split())
    except Exception as e:
        logging.info('Cannot find core terms from the product title')

    return core_term, brand, disc
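# The tagging step above relies on an ``nltk.UnigramTagger`` built from a pickled lookup
# table (``Table_PATH``) with a ``DefaultTagger('D')`` backoff. The table itself is not
# part of this section, so the snippet below uses a small hand-written table to show how
# that kind of tagger behaves: known words get their table tag, everything else falls
# back to 'D'. The toy tags ('B', 'C') and words are made up for illustration.
import nltk

toy_table = {"apple": "B", "iphone": "C", "case": "C"}
toy_tagger = nltk.UnigramTagger(model=toy_table, backoff=nltk.DefaultTagger('D'))

print(toy_tagger.tag(["apple", "iphone", "12", "leather", "case"]))
# [('apple', 'B'), ('iphone', 'C'), ('12', 'D'), ('leather', 'D'), ('case', 'C')]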
def pipeline(self, text):
    """Run the full text-cleaning pipeline over a piece of text."""
    text = utils.remove_space(text)
    text = utils.remove_punct(text)
    text = utils.remove_contractions(text.lower(), contractions)
    text = utils.remove_url(text)
    text = utils.remove_html(text)
    text = utils.correct_spellings(text)
    return text
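# The ``utils`` helpers chained in ``pipeline`` are defined elsewhere in the repository.
# As an illustration of what two of them plausibly do, here are minimal regex-based
# sketches of URL and HTML-tag removal; the real helpers may be implemented differently.
import re

def remove_url_sketch(text):
    """Hypothetical stand-in for utils.remove_url: drop http(s) and www links."""
    return re.sub(r"https?://\S+|www\.\S+", "", text)

def remove_html_sketch(text):
    """Hypothetical stand-in for utils.remove_html: drop anything that looks like a tag."""
    return re.sub(r"<[^>]+>", "", text)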
def process_names(file_dir):
    """
    For the brand and product names containing '-', generate two names:
    one with '-' replaced by ' ' and the other with '-' removed,
    e.g. 'e-book' --> 'e book' and 'ebook'.
    All other punctuation except '.' and "'" is removed.
    """
    results = []
    # Open in text mode so the regex substitutions operate on str lines.
    with open(file_dir, 'r') as f:
        lines = f.readlines()
    for line in lines:
        if '-' in line:
            line1 = re.sub("-", ' ', line)
            results.append(remove_punct(line1))
            line2 = re.sub("-", '', line)
            results.append(remove_punct(line2))
        else:
            results.append(remove_punct(line))
    return list(set(results))
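# A quick illustration of the hyphen handling described above: each hyphenated name
# yields both a spaced variant and a fused variant before punctuation removal.
import re

name = "e-book reader"
print(re.sub("-", " ", name))   # 'e book reader'
print(re.sub("-", "", name))    # 'ebook reader'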
def preprocess(text):
    delete_list = [",", "’"]
    tweet = utils.delete_characters_space(text, delete_list)
    word_list = tweet.split()
    # Clean, spell-correct and stem every token.
    word_list = [utils.stem_word(correction.correction(
                     utils.remove_punct(utils.remove_repeating_char(utils.remove_with_regex(word)))))
                 for word in word_list]
    # Keep only tokens longer than one character and drop stopwords.
    word_list = [word for word in word_list if len(word) > 1]
    word_list = utils.remove_words(word_list, STOPWORDS)
    sentence = ""
    for word in word_list:
        sentence = sentence + " " + word
    return sentence
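# ``utils.remove_repeating_char`` above is not defined in this section. In tweet
# preprocessing it usually collapses elongated characters ("sooooo" -> "soo"); the sketch
# below assumes that behaviour and is only illustrative of the idea.
import re

def remove_repeating_char_sketch(word):
    """Hypothetical stand-in: collapse runs of 3+ repeated characters down to two."""
    return re.sub(r"(.)\1{2,}", r"\1\1", word)

print(remove_repeating_char_sketch("soooo goooood"))  # 'soo good'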
def cure_text(text):
    # Strip annotation markers and special tokens before normalizing the text.
    text = re.sub(r"&=\S+", "", text)
    text = re.sub(r"\[.+?\]", "", text)
    text = re.sub(r"@s:\S+", "", text)
    text = text.replace("+&", "")
    text = text.replace("xxx", "")
    text = text.replace("0", "")
    text = text.replace("&", "")
    text = text.replace("☺", "")
    text = text.replace("▔", "")
    text = text.replace("\n", " ")
    text = remove_punct(text)
    text = text.lower()
    return " ".join(map(lambda x: x.strip(), text.split(" ")))
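# The non-greedy pattern ``\[.+?\]`` used above removes each bracketed annotation on its
# own; a greedy ``\[.+\]`` would instead swallow everything between the first '[' and the
# last ']'. A small comparison with made-up input:
import re

sample = "hi [laughs] there [sighs] friend"
print(re.sub(r"\[.+?\]", "", sample))   # 'hi  there  friend'
print(re.sub(r"\[.+\]", "", sample))    # 'hi  friend'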
def prepare_commonvoice(commonvoice_location, audio_path, text_path, lists_path, processes):
    for f in ['dev', 'test', 'train']:
        dst_list = os.path.join(lists_path, f"commonvoice-{f}.lst")
        dst_text = os.path.join(text_path, f"commonvoice-{f}.txt")
        if not os.path.exists(dst_list):
            to_list = partial(commonvoice_to_list, audio_path, f, commonvoice_location)
            with Pool(processes) as p:
                rows = read_tsv(os.path.join(commonvoice_location, f"{f}.tsv"))
                samples = list(tqdm(
                    p.imap(to_list, rows),
                    total=len(rows),
                ))
            with open(dst_list, "w") as list_f:
                list_f.writelines(samples)
            with open(dst_list, "r") as list_f, open(dst_text, "w") as text_f:
                for line in list_f:
                    text_f.write(" ".join(line.strip().split(" ")[3:]) + "\n")
        else:
            print(f"{dst_list} exists, verifying it")
            new_list = []
            with open(dst_list, "r") as list_f:
                for line in list_f:
                    filename = line.split(" ")[1]
                    text = " ".join(line.strip().split(" ")[3:])
                    params = " ".join(line.strip().split(" ")[:3])
                    text = remove_punct(text)
                    line = f"{params} {text}\n"
                    if not os.path.exists(filename) or len(text) < 2 or not alpha.match(text):
                        print(f"{filename} does not exist or its text is empty, text: {text}")
                    else:
                        new_list.append(line)
            with open(dst_list, "w") as list_f:
                list_f.writelines(new_list)
    print("Prepared CommonVoice", flush=True)
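# Each entry in the ``.lst`` files above is a space-separated line whose first three
# fields are metadata (the second being the audio filename, as used in the verification
# pass) and whose remaining fields are the transcript. The exact metadata comes from
# ``commonvoice_to_list``, which is not shown here; the line below is made up to
# illustrate how the splitting works.
line = "sample-0001 /data/audio/sample-0001.flac 4.25 hello common voice\n"
params = " ".join(line.strip().split(" ")[:3])   # 'sample-0001 /data/audio/sample-0001.flac 4.25'
filename = line.split(" ")[1]                    # '/data/audio/sample-0001.flac'
text = " ".join(line.strip().split(" ")[3:])     # 'hello common voice'
print(params, "|", filename, "|", text)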
def fix_verb(pas):
    """
    Fixes the verb by checking on previous verbs/auxiliaries in the original sentence.

    :param pas: PAS containing the verb to be fixed.
    :return: fixed verb.
    """
    raw_pas = pas.raw_pas
    pos = dict(pas.parts_of_speech)
    verb = raw_pas["V"]
    words = remove_punct(pas.sentence).split()
    verb_index = 0

    # Fetching the verb location in the original sentence.
    if verb in words:
        verb_index = words.index(verb)

    # Checking whether any of the 4 words preceding the verb are auxiliaries/modals.
    if verb in pos.keys():
        if pos[verb].startswith("VB"):
            verb_prefix = ""
            for i in range(1, 5):
                if verb_index - i >= 0:
                    if words[verb_index - i] in pos.keys():
                        if pos[words[verb_index - i]].startswith("VB") or words[verb_index - i] == "not" or \
                                words[verb_index - i] == "to":
                            verb_prefix = words[verb_index - i] + " " + verb_prefix
                        else:
                            break
                    else:
                        break

            # Excluding the cases in which the only part added is "to".
            if not verb_prefix.startswith("to"):
                verb = verb_prefix + verb

    return verb
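# A standalone trace of the look-back idea in ``fix_verb``: starting from the verb SENNA
# returned, walk up to four tokens backwards and keep any auxiliaries/"not"/"to" that
# precede it. The tokens and POS tags below are made up for illustration.
words = ["he", "has", "not", "been", "eating"]
pos = {"he": "PRP", "has": "VBZ", "not": "RB", "been": "VBN", "eating": "VBG"}
verb = "eating"
verb_index = words.index(verb)

verb_prefix = ""
for i in range(1, 5):
    j = verb_index - i
    if j < 0 or words[j] not in pos:
        break
    if pos[words[j]].startswith("VB") or words[j] in ("not", "to"):
        verb_prefix = words[j] + " " + verb_prefix
    else:
        break

print(verb_prefix + verb)   # 'has not been eating'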
def cure_text(text):
    text = remove_punct(text)
    text = text.lower()
    return " ".join(map(lambda x: x.strip(), text.split(" ")))
def extract_pas(sentences):
    """
    Extracts the PASs from a list of sentences.

    :param sentences: sentences from which to extract PAS.
    :return: list of extracted PASs.
    """
    # Compute the TFIDF vector of all terms in the document.
    tf_idfs = tf_idf(sentences, os.getcwd() + "/data/idfs.dat")

    # Longest sentence length, needed afterwards for the length score.
    longest_sent_len = max(len(sent) for sent in sentences)

    pas_list = []
    for sent in sentences:
        # Ignoring short sentences (errors).
        if 3 < len(remove_punct(sent)) and len(sent) < 1000:
            sent_index = sentences.index(sent)

            # Substituting single quotes with double quotes to avoid errors with SRL.
            sent = re.sub("\'([a-zA-Z0-9])([a-zA-Z0-9 ]+)([a-zA-Z0-9])\'", r'" \1\2\3 "', sent)
            annotations = _annotator.get_annoations(remove_punct(sent).split())
            # Getting SRL annotations from SENNA.
            sent_srl = annotations['srl']
            # Getting POS tags from SENNA.
            parts_of_speech = annotations['pos']

            for raw_pas in sent_srl:
                accept_pas = True
                out_of_order = 0
                chk_sent = remove_punct(sent)
                # Rejecting PASs with arguments that change the order (w.r.t. the original sentence);
                # these represent 10% of the total PASs, and 80% of them are incorrect.
                for arg in raw_pas.values():
                    # Replacing double spaces with a single space to avoid some arguments being ignored.
                    arg = remove_punct(arg.replace("  ", " "))
                    if chk_sent.find(arg) < 0:
                        accept_pas = False
                        out_of_order = 1
                        break

                if accept_pas:
                    pas = Pas(sent, parts_of_speech, sent_index, sent_srl.index(raw_pas), raw_pas, out_of_order)
                    pas_list.append(pas)

    # Completing each PAS with its realization, embeddings and vector representation.
    # This process is done after the initialization as all the other PASs are needed.
    realized_pass = []
    for pas in pas_list:
        realized_pass.append(realize_pas(pas))

    # Here the title is put together with the PASs to avoid starting another embedding process.
    realized_pass.append(sentences[0])
    pas_embeddings = sentence_embeddings(realized_pass)

    # Get the centrality scores for the PAS embeddings.
    pas_centralities = centrality_scores(pas_embeddings)

    for pas in pas_list:
        pas_index = pas_list.index(pas)
        pas.complete_pas(
            realized_pass[pas_index],
            pas_embeddings[pas_index],
            len(sentences),
            longest_sent_len,
            tf_idfs,
            pas_centralities[pas_index],
            np.inner(np.array(pas_embeddings[pas_index]), np.array(pas_embeddings[-1])))

    return pas_list
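# The last score passed to ``complete_pas`` above is the inner product between the PAS
# embedding and the title embedding (the title is appended as the final element of
# ``realized_pass``). A tiny numpy illustration of that scoring with made-up vectors:
import numpy as np

pas_embedding = np.array([0.5, 0.25, 1.0])
title_embedding = np.array([2.0, 4.0, 0.5])
print(np.inner(pas_embedding, title_embedding))   # 2.5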
def prefix_suggestion(word):
    temp = remove_punct(word).lower()
    temp = re.split(' ', temp)[0:4]
    suggestion = [' '.join(temp[0:i]) for i in range(1, min(4, 1 + len(temp)))]
    return suggestion
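# Example of the prefixes generated above (assuming remove_punct only strips punctuation,
# as in the earlier sketch):
# prefix_suggestion("New York City Marathon 2020")
#   -> ['new', 'new york', 'new york city']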
def keynorm(word):
    return remove_punct(word).lower()