def tokenize(articles):
    """Extract noun base forms from every article's cleaned HTML.

    Each article's ``html`` is stripped of HTML/JS tags, normalized, and
    de-coded via ``Clean``; the tokenizer then pulls noun base forms from
    the resulting text. Returns one flat list of tokens across all articles.
    """
    tokenizer = MeCabTokenizer(
        user_dic_path='/usr/local/lib/mecab/dic/mecab-ipadic-neologd')
    per_article_tokens = [
        tokenizer.extract_noun_baseform(
            Clean(article.html)
            .clean_html_and_js_tags()
            .clean_text()
            .clean_code()
            .text
        )
        for article in articles
    ]
    # Flatten the list-of-lists into a single token list.
    return list(chain.from_iterable(per_article_tokens))
def add_schedule(self): self.__clear__() set_clean = Schedule( ) # calls an instance of schedule to be made and checked if set_clean.check(): print "(+) Time to Clean " new_clean = Clean() else: print "(-) It is not yet time to clean " set_clean.save() set_clean.show() new = Menu()
def __main__():
    """Entry point: resolve URL and date from argv, then parse, clean, and publish.

    Argument handling:
      - no args:  both URL and date come from ``Link()``
      - one arg:  URL from argv, date from ``Link()``
      - two args: URL and date both from argv

    Relies on module-level ``csv_file`` and ``json_file`` — defined elsewhere
    in this file (not visible here); confirm before refactoring.
    """
    # Fix: close the JSON file deterministically instead of leaking the
    # handle from json.load(open(...)).
    with open('./json/nj_municipals.json') as f:
        nj_municipals = json.load(f)
    counties = list(nj_municipals.keys())

    argc = len(sys.argv)
    if argc == 1:
        url, date = Link()
    elif argc == 2:
        _, date = Link()
        url = sys.argv[1]
    else:
        url = sys.argv[1]
        date = sys.argv[2]

    print(url)
    print(date)

    data = Parse(url, counties)
    total_df = Clean(csv_file, data, date, nj_municipals)
    Update(total_df, csv_file)
    Today(total_df, date, counties, json_file)
def clean(sentence):
    """Return *sentence* with HTML/JS tags stripped and text normalized."""
    cleaned = Clean(sentence).clean_html_and_js_tags().clean_text()
    return cleaned.text
#!/usr/bin/env python # -*- coding: utf-8 -*- import os try: import datetime from Login import Login from readconfig import ReadConfig from clean import Clean lj = Login() dc = Clean() _info = ReadConfig() except Exception as e: print('配置文件缺失%s' %e) print('输入enter停止') _k= input() os._exit(0) def main(): try: print('该程序用于清洗公司名称数据->修改配置文件即可运行') print('版本:1.3') print('输入enter开始') k1=input() #读取数据库原始数据 print('正在清洗数据->请稍后..') info1 = _info.get_input("col_name") #字段名称 info2 = _info.get_input("table_name") #表名->有模式需要加.配置完整表名 sql1 = ''' select {}
def cleaning(self):
    """Clear the display, run a cleaning pass, and return to the menu."""
    self.__clear__()
    # Constructing Clean() presumably performs the cleaning — TODO confirm.
    cleaner = Clean()
    # Menu() presumably re-renders the main menu — TODO confirm.
    menu = Menu()
import pandas as pd
#import numpy as np
#org_data = pd.read_excel('Sharma_2018-07-24.xlsx', usecols='E:AJ')
#org_data= pd.read_excel('../data/all_data.xlsx', usecols='E:AJ')
#backup_data=org_data

# NOTE(review): org_data is only assigned in the commented-out lines above —
# as written this raises NameError unless it is defined elsewhere; confirm.

# Cleaning pass: mutate/replace org_data via the project's Clean helper.
from clean import Clean
cleaner = Clean(org_data)
cleaner.clean_data()
org_data = cleaner.org_data

# Imputation pass: fill missing values on the cleaned data.
from imputation import Impute
imputer = Impute(org_data)
imputer.impute_data()
org_data = imputer.org_data
# Import own files from clean import Clean import vb_encoder # Global definitions csv.field_size_limit(2 ** 30) NUM_DOCS = 17153 # for progress bar purposes only COURT_RANKINGS = { 3: ['sg court of appeal', 'sg privy council', 'uk house of lords', 'uk supreme court', 'high court of australia', 'ca supreme court'], 2: ['sg high court', 'singapore international commercial court', 'hk high court', 'hk court of first instance', 'uk crown court', 'uk court of appeal', 'uk high court', 'federal court of australia', 'nsw court of appeal', 'nsw court of criminal appeal', 'nsw supreme court'] } # Create instances of imported classes cleaner = Clean() def usage(): print( "Usage: " + sys.argv[0] + " -i dataset-file -d dictionary-file -p postings-file" ) # Writes out the total number of documents in the collection to the postings file # This is basically N, to compute inverse document frequency def write_collection_size_to_disk(collection_size: int, out_postings): # Open our postings file f_postings = open(out_postings, "wb")
def __init__(self):
    """Initialize the instance with its text-cleaning helper."""
    # self.classifier = HackAriba()
    # Clean() instance used by the rest of this class for text cleanup.
    self.clean = Clean()