def make_references(data):
    """Build one ``Reference`` per entry of ``data``.

    Each entry is queried with ``.get('reference')`` and ``.get('text')``;
    the text is passed through ``clean_text`` before being handed to
    ``Reference`` together with the reference value.

    Returns:
        A list of ``Reference`` objects in the same order as ``data``.
    """
    return [
        Reference(entry.get('reference'), clean_text(entry.get('text')))
        for entry in data
    ]
def carga_textos(folder, termina):
    """Load and clean every file in ``folder`` whose name ends with ``termina``.

    Args:
        folder: Directory whose entries are listed with ``os.listdir``.
        termina: Suffix handed to ``str.endswith`` to select the files.

    Returns:
        A list with one ``clean.clean_text`` result per matching file, in
        ``os.listdir`` order. If an ``IOError`` is raised while listing or
        reading, the error is printed and an empty list is returned (any
        texts collected so far are discarded).
    """
    lista_textos = []  # cleaned contents, one item per matching file
    try:
        for archivo in os.listdir(folder):
            if not archivo.endswith(termina):
                continue
            texto = lector.leer_archivo(os.path.join(folder, archivo))
            lista_textos.append(clean.clean_text(texto))
    except IOError as e:
        # Best-effort behavior: report the problem and hand back nothing.
        # (The handler used to call the misspelled ``pirnt``, which raised
        # NameError instead of reporting the error.)
        print(e)
        lista_textos = []
    return lista_textos
def get_nlp(self, text, display=False):
    """Run ``self.nlp`` on the cleaned version of ``text``.

    Args:
        text: Raw input; it is passed through ``clean.clean_text`` first.
        display: When truthy, ``displacy.serve`` is called on the resulting
            doc with ``style="dep"`` before the doc is returned.

    Returns:
        Whatever ``self.nlp`` returns for the cleaned text.
    """
    doc = self.nlp(clean.clean_text(text))
    if not display:
        return doc
    displacy.serve(doc, style="dep")
    return doc
def _clean_(self, text):
    """Return ``text`` after passing it through ``clean_text``."""
    cleaned = clean_text(text)
    return cleaned
def read_datapoints(FILE_PATH: str) -> List[Dict]:
    """Read a headerless tab-separated file into a list of row dicts.

    Args:
        FILE_PATH: Path of the file to read.

    Returns:
        One dict per record, keyed by ``'target'`` and ``'sms'`` (the first
        and second tab-separated fields).
    """
    # The csv module asks for newline='' so it can do its own line handling;
    # without it, "\r\n" inside a quoted field is rewritten to "\n" by the
    # text layer before the reader sees it.
    with open(FILE_PATH, newline='') as f:
        reader = csv.DictReader(f, delimiter='\t', fieldnames=['target', 'sms'])
        return list(reader)


if __name__ == "__main__":
    args = read_args()
    data = read_datapoints(args.data_path)
    df = pd.DataFrame(data)

    # Clean every message, then record the length of the cleaned text.
    df['sms'] = df['sms'].apply(clean_text)
    df['sms_length'] = df['sms'].apply(len)

    # Label encoding used by this script: 'ham' -> True, 'spam' -> False.
    df['target'] = df['target'].replace({'ham': True, 'spam': False})

    # Shuffle the rows (no seed is given), then assign each row the index of
    # the stratified fold in which it is part of the validation split.
    df["kfold"] = -1
    df = df.sample(frac=1).reset_index(drop=True)
    kf = StratifiedKFold(n_splits=5)
    for fold, (_, valid_idx) in enumerate(kf.split(X=df, y=df.target)):
        df.loc[valid_idx, 'kfold'] = fold

    df.to_csv(os.path.join(args.output_dir, 'cleaned_data.csv'), index=False, sep='\t')
import clean

# Cleaned articles collected during this session, in entry order.
articles = []


def give_prompt():
    """Print a separator, the number of stored articles and the usage hint."""
    print("----------")
    print("Currently stored: %s\nPlease enter each article below, followed by 'end'. Enter 'stop' when finished.\n" % len(articles))


# Lines typed since the last 'end'; together they form the next article.
recent_inputs = []
give_prompt()
while True:
    # input() strips the trailing newline; put it back so that joined lines
    # keep their line breaks.
    text_input = input("> ") + "\n"
    command = text_input.lower()

    # The prompt advertises 'stop' while only 'save' used to be recognised;
    # accept both so the documented command actually ends the session.
    if command in ("save\n", "stop\n"):
        total = len(articles)
        with open("output.txt", "w") as outfile:
            # Number articles from 1 so the header reads "1 of N" .. "N of N"
            # (it previously started at 0 and never reached N).
            for number, stored in enumerate(articles, start=1):
                outfile.write("ARTICLE %s of %s -- Ordering: ___ / %s\n\n" % (number, total, total))
                outfile.write(stored)
                outfile.write("\n----------\n\n")
        break

    if command == "end\n":
        # Close the current article: clean it, echo it, store it, and start
        # collecting the next one.
        article = "".join(recent_inputs)
        cleaned = clean.clean_text(article)
        print(cleaned)
        articles.append(cleaned)
        recent_inputs = []
        give_prompt()
    else:
        recent_inputs.append(text_input)
def normalize(file_text):
    """Normalize ``file_text`` through three successive steps.

    The text is passed through ``clean_html_tags``, then ``clean_text``,
    and finally ``neologdn.normalize``; the result of the last step is
    returned.
    """
    without_tags = clean_html_tags(file_text)
    cleaned = clean_text(without_tags)
    # Disabled step, kept for reference:
    # cleaned = normalize_number(cleaned)
    return neologdn.normalize(cleaned)