def cleaning():
    print('\n\n\ncleaning...')
    instance_of_workspace = Cleaner(list_of_paths_to_process)
    instance_of_workspace.show_basic_info()
    instance_of_workspace.save_df_as_image('output_images/data_as_is.png')
    instance_of_workspace.drop_redundant_data()
    instance_of_workspace.save_df_as_image('output_images/cleared_data.png')
def pre_process():
    cleaning = Cleaner()
    engineering = FeatureEngineer()

    # Cleaning
    train_df, test_df, magic_df = load_data(DATA_DIR)
    train_df = train_df.iloc[:300]
    test_df = test_df.iloc[:100]
    len_train = len(train_df)
    train_df = cleaning.train(train_df)
    test_df = cleaning.test(test_df)
    magic_df = cleaning.magic(magic_df)
    logger.info('Finished cleaning...')

    # Feature Engineering
    df = pd.concat([train_df, test_df], sort=False)
    df = pd.merge(df, magic_df, on='listing_id')
    df = engineering.basic(df)
    df = engineering.manager_id(df)
    df = engineering.location(df)
    logger.info('Finished feature engineering...')

    # Target Encoding
    train_df, test_df = df.iloc[:len_train], df.iloc[len_train:]
    train_df, test_df = target_encoder(train_df, test_df)
    logger.info('Finished target encoding...')

    return train_df, test_df
def __init__(self, a, dest, c=1):
    self.dest = dest
    s = Publi()
    if a['source'] not in s.codex:
        prefix = "EUROPRESSE"
        source = a['source']
        source_type = "unknown source"
    else:
        prefix = s.codex[a['source']]['abr']
        source = s.codex[a['source']]['source']
        source_type = s.codex[a['source']]['type']
    self.filename = self.file_name(a['date'], prefix)

    text = a['title'] + "\r\n.\r\n"
    text += a['subtitle'] + "\r\n.\r\n" if a['subtitle'] else ""
    text += a['text']

    ctx = [
        "fileCtx0005",
        a['title'],
        source,
        "", "",
        a['date'],
        source,
        source_type,
        "", "", "",
        "Processed by Tiresias on %s"
            % datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        "",
        "n", "n",
        ""
    ]
    ctx = "\r\n".join(ctx)

    if c:
        cl_txt = Cleaner(text.encode('utf-8'))
        text = cl_txt.content.encode('latin-1', 'xmlcharrefreplace')  # to bytes
        cl_ctx = Cleaner(ctx.encode('utf-8'))
        ctx = cl_ctx.content.encode('latin-1', 'xmlcharrefreplace')  # to bytes
    else:
        ctx = ctx.encode('latin-1', 'xmlcharrefreplace')  # to bytes
        text = text.encode('latin-1', 'xmlcharrefreplace')  # to bytes

    path = os.path.join(self.dest, self.filename + ".txt")
    with open(path, 'wb') as f:
        f.write(text)
    path = os.path.join(self.dest, self.filename + ".ctx")
    with open(path, 'wb') as f:
        f.write(ctx)
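# A minimal usage sketch for the constructor above; the class name
# ProsperoWriter is hypothetical, and the dict values are illustrative only.
article = {
    'source': "Le Monde",           # looked up in Publi().codex
    'date': "01/01/2020",
    'title': "Article title",
    'subtitle': "",                 # optional; skipped when empty
    'text': "Body of the article",
}
ProsperoWriter(article, dest="corpus", c=1)  # writes <dest>/<filename>.txt and .ctx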
def write_prospero_files(self, save_dir=".", cleaning=False):
    """For each article, write a .txt and a .ctx file in the given directory."""
    for article in self.articles.values():
        filepath = file_name(article['date'], article['root'], save_dir)
        path = os.path.join(save_dir, filepath + ".txt")
        article['text'] = article['title'] + "\r\n.\r\n" + article['text']
        if cleaning:
            text_cleaner = Cleaner(article['text'].encode('utf-8'))
            text = text_cleaner.content
        else:
            text = article['text']
        with open(path, 'wb') as file:
            file.write(text.encode('latin-1', 'xmlcharrefreplace'))  # to bytes

        ctx = [
            "fileCtx0005",
            article['title'],
            article['support'],
            "", "",
            article['date'],
            "",
            article['source_type'],
            "", "", "",
            "Processed by Tiresias on %s"
                % datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            "",
            "n", "n",
            ""
        ]
        ctx = "\r\n".join(ctx)
        ctx = ctx.encode('latin-1', 'xmlcharrefreplace')  # to bytes
        path = os.path.join(save_dir, filepath + ".ctx")
        with open(path, 'wb') as file:
            file.write(ctx)
def write_txt(path, text):
    text_cleaner = Cleaner(text.encode('utf-8'))
    text = text_cleaner.content.encode('latin-1', 'xmlcharrefreplace')  # to bytes
    with open(path, 'wb') as f:
        f.write(text)
def __init__(self, url):
    dest = "C:\\corpus\\EnergiCorpus\\FR\\TEE\\"
    with urllib.request.urlopen(url) as page:
        soup = BeautifulSoup(page, "lxml")
    title = soup.title.string
    author = soup.find("div", "meta-author").text
    date = soup.find("div", "meta-date").text
    title = re.sub(" - Transitions & Energies", "", title)
    print(title, author, date)

    content = title + "\r\n.\r\n\r\n"
    article = soup.find('article')
    for el in article.find_all(['h2', 'p']):
        if el.name == "h2":
            content += "\r\n\r\n" + el.text + "\r\n.\r\n"
        else:
            content += el.text

    date = formate_date(date)
    ctx = formate_ctx(title, date, url)
    ctx_cleaner = Cleaner(ctx.encode('utf-8'))
    ctx = ctx_cleaner.content.encode('latin-1', 'xmlcharrefreplace')  # to bytes
    text_cleaner = Cleaner(content.encode('utf-8'))
    text = text_cleaner.content.encode('latin-1', 'xmlcharrefreplace')  # to bytes

    filename = file_name(dest, date, "TEE")
    path = os.path.join(dest, filename + ".txt")
    with open(path, 'wb') as f:
        f.write(text)
    path = os.path.join(dest, filename + ".ctx")
    with open(path, 'wb') as f:
        f.write(ctx)
def create_ctx(path, metadata):
    ctx = [
        "fileCtx0005",
        metadata['title'],
        metadata['authors'],
        "", "",
        "01/01/" + metadata['date'],
        "",
        "chapitre",
        metadata['ref'],
        "", "",
        "Processed by Tiresias on %s"
            % datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        "",
        "n", "n",
        ""
    ]
    ctx = "\r\n".join(ctx)
    ctx_cleaner = Cleaner(ctx.encode('utf-8'))
    ctx = ctx_cleaner.content.encode('latin-1', 'xmlcharrefreplace')  # to bytes
    with open(path, 'wb') as file:
        file.write(ctx)
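# A minimal usage sketch, assuming `metadata` carries the keys read above;
# the path and values are illustrative only.
create_ctx("corpus/chapter_01.ctx", {
    'title': "Chapter title",
    'authors': "A. Author; B. Author",
    'date': "1998",            # year only; prefixed with "01/01/" above
    'ref': "Book reference",
})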
def ctx_prospero(
    csvfile,
    save_dir=".",
    cleaning=False,
    brackets=False,
    author_keywords=False,
    index_keywords=False,
    rm_copyright=False,
):
    """Convert a Scopus CSV export to Prospero-format .txt/.ctx files."""
    reader = csv.DictReader(csvfile, delimiter=",")
    papers = {}
    file_count = 0
    no_abstract = 0
    for row in reader:
        link = row['Link']
        eid = re.search(r'eid=([^&]*)&', link).group(1)
        if row['Abstract'] == "[No abstract available]":
            no_abstract += 1
        else:
            papers[eid] = [
                row['Authors'],
                row['Title'],
                row['Year'],
                row['Abstract'],
                row['Author Keywords'],
                row['Index Keywords'],
            ]
    for eid in papers:
        # Remove the translations between [] at the end of the title
        if brackets:
            papers[eid][1] = re.sub(r"\[.*\]$", "", papers[eid][1])
        # Put the title at the beginning of the text
        txt_content = papers[eid][1] + "\r\n.\r\n"
        # Append author keywords
        if author_keywords and papers[eid][4]:
            txt_content += papers[eid][4] + "\r\n.\r\n"
        # Append index keywords
        if index_keywords and papers[eid][5]:
            txt_content += papers[eid][5] + "\r\n.\r\n"
        # Remove the copyright notice
        if rm_copyright:
            papers[eid][3] = re.sub(r" (©|Copyright),? \d{4},? .*$", "", papers[eid][3])
        # Append the abstract
        txt_content += papers[eid][3]
        if cleaning:
            text_cleaner = Cleaner(txt_content.encode('utf-8'))
            txt_content = text_cleaner.content
        txt_content = txt_content.encode('latin-1', 'xmlcharrefreplace')  # to bytes
        filename = os.path.join(save_dir, eid)
        with open("%s.txt" % filename, 'wb') as txtfile:
            txtfile.write(txt_content)
        file_count += 1
        ctx = [
            "fileCtx0005",
            papers[eid][1],
            papers[eid][0],
            "", "",
            "01/01/%s" % papers[eid][2],
            "", "", "", "", "",
            "From Scopus by Tiresias on %s"
                % datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            "",
            "n", "n",
            ""
        ]
        ctx = "\r\n".join(ctx)
        with open("%s.ctx" % filename, 'wb') as ctxfile:
            ctx = ctx.encode('latin-1', 'xmlcharrefreplace')  # to bytes
            ctxfile.write(ctx)
        file_count += 1
    return file_count, no_abstract
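# A minimal usage sketch, assuming a Scopus CSV export with the columns read
# above (Link, Authors, Title, Year, Abstract, Author Keywords, Index Keywords).
# The file path is illustrative only.
with open("scopus_export.csv", newline='', encoding='utf-8') as csvfile:
    written, skipped = ctx_prospero(
        csvfile,
        save_dir="corpus",
        cleaning=True,
        brackets=True,
        rm_copyright=True,
    )
print("%d files written, %d records skipped (no abstract)" % (written, skipped))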
from cleaning import Cleaner
from Scrapper import Scrapper
import pandas as pd

if __name__ == "__main__":
    cleaner = Cleaner("../cache")
    print("Enter a year between 1800 and 1900")
    year = int(input())
    if 1800 <= year <= 1900:
        print("=======> " + str(year))
        arks = Scrapper.get_arks(year)
        for ark in arks:
            print("=======> " + ark)
            print(f"- Downloading {ark}")
            file = Scrapper.get_document(ark)
            print(f"- Extracting {ark}")
            df = cleaner.extract(file)
            print(f"{df.shape[0]} rows detected")
            print(f"- Post-processing {ark}")
            df = cleaner.postProcess(df)
            print("- Spell checking")
            df = cleaner.spell_check(df)
            print("- Saving")
            cleaner.save(df, ark)
            print("Finished " + ark)
            print("\n")
            del file
            del df