def regex_detection(self, sentence, full_text=None, offset=0):
    """
    Detect entities with a regex in sentence

    Keyword arguments:
    sentence -- a sentence in plain text
    full_text -- the complete document, used for proximity checks
    offset -- position of the sentence within full_text
    """
    # list to store detections
    unconsolidated_broad_list = []
    result_broad_list = self._detect_regexp(sentence, "broad")
    strict_list = copy.deepcopy(self._detect_regexp(sentence, "strict"))
    consolidated_list = [clean_text(regexp[0]) for regexp in strict_list]
    # keep only broad matches that were not already found by a strict regexp
    for _broad_regexp in result_broad_list:
        if clean_text(_broad_regexp[0]) not in consolidated_list:
            unconsolidated_broad_list.append(_broad_regexp)
    # check proximity conditions of broad regexp detections
    # (if the list is not re-initialized to [], results are duplicated)
    consolidated_broad_list = self._check_proximity_conditions(
        unconsolidated_broad_list, full_text, offset)
    # Validate
    validate_list, strict_list, consolidated_broad_list = self._validate(
        strict_list, consolidated_broad_list)
    unconsolidated_broad_list = self._remove_unconsolidated_matches(
        consolidated_broad_list, unconsolidated_broad_list)
    return strict_list, consolidated_broad_list, unconsolidated_broad_list, validate_list
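
# A hypothetical sketch of the two-tier detection regex_detection consolidates,
# assuming _detect_regexp returns (matched_text, start, end) tuples; the tier
# names come from the method above, but the patterns themselves are invented
# for illustration:
import re

PATTERNS = {
    # "strict" matches are trusted as-is
    "strict": re.compile(r"\b[A-Z]{2}\d{2}(?: ?\d{4}){5}\b"),
    # "broad" matches only survive the proximity/validation checks above
    "broad": re.compile(r"\b\d{4}(?: ?\d{4}){2,}\b"),
}

def _detect_regexp(sentence, mode):
    """Return (matched_text, start, end) for every match of the given tier."""
    return [(m.group(0), m.start(), m.end())
            for m in PATTERNS[mode].finditer(sentence)]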
def predict(self, text):
    # Tokenize, then pad/truncate to the fixed sequence length the model expects
    words = self.tokenizer.preprocess(clean_text(text))
    if len(words) < self.seq_length:
        words += ['<pad>'] * (self.seq_length - len(words))
    elif len(words) > self.seq_length:
        words = words[:self.seq_length]
    tokens = [self.tokenizer.vocab.stoi[word] for word in words]
    tokens = torch.LongTensor(tokens).unsqueeze(0)
    # Baseline (all reference tokens) input used by the attribution method
    reference_tokens = self.token_reference.generate_reference(
        self.seq_length, device='cpu').unsqueeze(0)
    pred = self.model(tokens)
    plabel = int(torch.argmax(pred, 1))
    pred = pred.tolist()[0]
    unpad_index = [
        idx for idx, word in enumerate(words) if word != '<pad>'
    ]
    unpad_words = [word for word in words if word != '<pad>']
    # Compute per-token attributions for every class, not just the predicted one
    attributions = []
    for label in range(len(pred)):
        attributions.append(
            list(
                self.attribute(tokens, reference_tokens, label,
                               unpad_index)))
    return unpad_words, pred, plabel, attributions
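
# The `self.attribute` call above is not shown; here is a minimal sketch of
# what it might look like, assuming Captum's LayerIntegratedGradients and a
# model whose first layer is an embedding named `model.embedding` (both are
# assumptions, not confirmed by the code above):
import torch
from captum.attr import LayerIntegratedGradients

def attribute(model, tokens, reference_tokens, label, unpad_index, n_steps=50):
    lig = LayerIntegratedGradients(model, model.embedding)
    # attributions has shape (1, seq_length, embedding_dim)
    attributions = lig.attribute(tokens, reference_tokens,
                                 target=label, n_steps=n_steps)
    # Collapse the embedding dimension to one score per token and normalize
    scores = attributions.sum(dim=2).squeeze(0)
    scores = scores / torch.norm(scores)
    # Keep only the scores for non-padding positions
    return [float(scores[idx]) for idx in unpad_index]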
def _validate_list(self, validate_list, _list):
    # Partition the detections: entities that pass _func_validate go to
    # validate_list, the rest are returned in a new list
    new_list = []
    for regexp in _list:
        ent = clean_text(regexp[0])
        if self._func_validate(ent):
            validate_list.append(regexp)
        else:
            new_list.append(regexp)
    return validate_list, new_list
def parse_table(soup):
    table = {}
    name = clean_text(soup.find('tbody').find('tr').text)
    table[name] = []
    if 'No Record Found' in soup.text:
        return {}
    headers = [
        clean_text(x.text)
        for x in soup.find('tbody').find_all('tr')[1].find_all('td')
    ]
    table[name].append(headers)
    for row in soup.find_all('tbody')[1].find_all('tr'):
        if 'Total' in row.text:
            # Skip the row which gives the total
            continue
        values = [clean_text(x.text) for x in row.find_all('td')]
        if len(values) < len(headers):
            # HACK: Assuming that if there are fewer columns than expected,
            # the left-side cells are merged
            values.insert(0, table[name][1][0])
        table[name].append(values)
    return table
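
# A usage sketch: the markup below is invented to mimic the structure
# parse_table expects (first tbody holds a title row and a header row, the
# second tbody holds the data rows), and clean_text is assumed to only
# strip whitespace here:
from bs4 import BeautifulSoup

sample_html = """
<table>
  <tbody>
    <tr><td>Account Summary</td></tr>
    <tr><td>Scheme Name</td><td>Units</td></tr>
  </tbody>
  <tbody>
    <tr><td>Scheme A</td><td>10.5</td></tr>
    <tr><td>Total</td><td>10.5</td></tr>
  </tbody>
</table>
"""
# -> {'Account Summary': [['Scheme Name', 'Units'], ['Scheme A', '10.5']]}
print(parse_table(BeautifulSoup(sample_html, 'html.parser')))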
def train_classifiers(data_path, save_path, multithreading, lang='eng'):
    # Read data from provided location
    dataframe = pd.read_csv(data_path, header=0, sep=";")
    dataframe = dataframe.dropna()
    print("Cleaning text...", end="")
    dataframe['text'] = dataframe['text'].map(
        lambda x: utils.clean_text(x, lang))
    print("Done")
    # Train/test split
    train, test, label_train, label_test = utils.create_train_test(
        dataframe["text"], dataframe["label"])
    # TF-IDF bag-of-words extraction. These lines extract features from the
    # texts based on word frequencies across documents
    print("Creating TFIDF data")
    unwanted = stopwords.words(lang)
    tfidf_vect = TfidfVectorizer(analyzer='word',
                                 ngram_range=(1, 2),  # equivalent to the original range(1, 3)
                                 max_features=3000,
                                 stop_words=unwanted,
                                 max_df=0.5,
                                 min_df=3)
    tfidf_vect.fit(train)
    # Saving the vectorizer model
    tmp = os.path.join(save_path, "Models")
    os.makedirs(tmp, exist_ok=True)  # ensure the target directory exists
    save = os.path.join(tmp, "vectorizer.pkl")
    with open(save, 'wb') as pklfile:
        pkl.dump(tfidf_vect, pklfile)
    train_bow = tfidf_vect.transform(train)
    test_bow = tfidf_vect.transform(test)
    print("Beginning supervised training...")
    if multithreading:
        parallel_pipeline(train_bow, test_bow, label_train, label_test,
                          save_path)
    else:
        classifiers_pipeline(train_bow, test_bow, label_train, label_test,
                             save_path)
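
# A minimal sketch of re-using the persisted vectorizer at inference time;
# the path and sample text are illustrative, not taken from the code above:
import os
import pickle as pkl

with open(os.path.join("output", "Models", "vectorizer.pkl"), 'rb') as pklfile:
    tfidf_vect = pkl.load(pklfile)
features = tfidf_vect.transform(["some already-cleaned text"])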
def test_clean_text(self):
    processed_test = clean_text("http://test.site @mention")
    self.assertEqual(processed_test, " ")
def main(args) -> dict:
    """
    Execute the command.

    :param args: parsed command-line arguments
    """
    output = {}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:78.0) Gecko/20100101 Firefox/78.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'https://cra-nsdl.com/CRA/',
    }
    sensor = NpsSensor('finance/nps', URLS['base'], headers=headers, creds=True)

    @retry(CaptchaError, tries=5, delay=5)
    def login():
        sensor.download_captcha(URLS['login'])
        captcha = sensor.solve_captcha()
        logger.info(f"Captcha = {captcha}")
        login_url = URLS['login'] + ';' + sensor.session.cookies['JSESSIONID']
        data = {
            'userID': sensor.credentials['username'],
            'password': sensor.credentials['password'],
            'subCaptchaVal': captcha,
        }
        # Log in
        sensor.post(url=login_url, data=data)
        if "Your Password has expired." in sensor.response.text:
            raise LoginError("The Password has expired")
        sensor.dump_html('login-out.html')
        if sensor.soup.find('div', {'class': 'login-tab'}):
            raise CaptchaError("Captcha was not validated")
        logger.info("Success!!")

    logger.info("Logging in now ..")
    login()
    if "Welcome Subscriber" not in sensor.soup.text:
        sensor.dump_html("login-error.html")
        raise LoginError("Login did not work")

    account_id = sensor.get_id()
    sensor.get(URLS['account_details'].format(id=account_id))

    def parse_table(soup):
        table = {}
        name = clean_text(soup.find('tbody').find('tr').text)
        table[name] = []
        if 'No Record Found' in soup.text:
            return {}
        headers = [
            clean_text(x.text)
            for x in soup.find('tbody').find_all('tr')[1].find_all('td')
        ]
        table[name].append(headers)
        for row in soup.find_all('tbody')[1].find_all('tr'):
            if 'Total' in row.text:
                # Skip the row which gives the total
                continue
            values = [clean_text(x.text) for x in row.find_all('td')]
            if len(values) < len(headers):
                # HACK: Assuming that if there are fewer columns than
                # expected, the left-side cells are merged
                values.insert(0, table[name][1][0])
            table[name].append(values)
        return table

    rawdata = {}
    for t in sensor.soup.find_all('table', {'class': 'table-newnorow'}):
        table_data = parse_table(t)
        rawdata.update(table_data)
    prefs = list_of_lists_to_dict(rawdata['Current Scheme Preference'],
                                  "Scheme Details")
    summary = list_of_lists_to_dict(
        rawdata['Account Summary For Current Schemes'], "Scheme Name")
    date = clean_text(sensor.soup.find(id='stddate').span.text)
    pran = clean_text(sensor.soup.find(id='pranno').text)
    for scheme, scheme_data in summary.items():
        scheme_data['Percentage'] = prefs[scheme]['Percentage']
        scheme_data['Date'] = date
        scheme_data['PRAN Number'] = pran
        output[scheme] = scheme_data

    logger.info("Logging off ..")
    sensor.get(URLS['logoff'].format(id=sensor.get_id()))
    return output
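
# list_of_lists_to_dict is not shown above; a hypothetical sketch of its
# behavior, inferred only from how main() uses it (rows are [headers, row,
# row, ...] as produced by parse_table, keyed by one named column):
def list_of_lists_to_dict(rows, key_column):
    """Turn [[headers], row, ...] into {row[key_column]: {header: value}}."""
    headers = rows[0]
    key_idx = headers.index(key_column)
    return {row[key_idx]: dict(zip(headers, row)) for row in rows[1:]}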
# Dropping N/A's
df_randomized = df_randomized.dropna()

n_features = 1000
tf_vectorizer = TfidfVectorizer(stop_words='english',
                                max_df=0.98,
                                min_df=3,
                                max_features=n_features)
cleaned_texts = []
wnl = WordNetLemmatizer()
for t in list(df_randomized.loc[:, 'A/P']):
    # Clean the text: keep only nouns and adjectives, then lemmatize
    t = str(t)
    blob = TextBlob(t)
    good_words = [word for word, tag in blob.tags
                  if tag == 'NN' or tag == 'JJ']
    good_words = utils.clean_text(' '.join(good_words))
    good_words = [wnl.lemmatize(i) for i in good_words.split(' ')]
    cleaned_texts.append(' '.join(good_words))
tf = tf_vectorizer.fit_transform(cleaned_texts)


# Bayesian optimization: x = (loss, penalty, alpha); return the negative
# mean macro-F1 so that minimizing the objective maximizes the score
def optimize(x):
    clf = SGDClassifier(loss=x[0], penalty=x[1], alpha=x[2], random_state=0)
    scores = cross_val_score(clf,
                             tf.toarray(),
                             df_randomized['target'],
                             cv=5,
                             scoring='f1_macro')
    return -1 * scores.mean()
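
# A minimal sketch of driving `optimize` with a Bayesian optimizer; this
# assumes scikit-optimize (skopt), which the snippet above does not name,
# and the search-space choices below are illustrative:
from skopt import gp_minimize
from skopt.space import Categorical, Real

search_space = [
    Categorical(['hinge', 'modified_huber', 'squared_hinge']),  # loss -> x[0]
    Categorical(['l2', 'l1', 'elasticnet']),                    # penalty -> x[1]
    Real(1e-6, 1e-2, prior='log-uniform'),                      # alpha -> x[2]
]
result = gp_minimize(optimize, search_space, n_calls=30, random_state=0)
print("best parameters:", result.x, "best macro-F1:", -result.fun)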
def clean_entity(text):
    return clean_text(text)