Example No. 1
from itertools import chain


def tokenize(articles):
    # Extract noun base forms from each article's HTML body.
    # MeCabTokenizer and Clean are project-local helpers.
    results = []
    tokenizer = MeCabTokenizer(
        user_dic_path='/usr/local/lib/mecab/dic/mecab-ipadic-neologd')
    for article in articles:
        clean = Clean(article.html)
        cleaned = clean.clean_html_and_js_tags().clean_text().clean_code()
        tokens = tokenizer.extract_noun_baseform(cleaned.text)
        results.append(tokens)
    # Flatten the per-article token lists into a single list.
    return list(chain.from_iterable(results))
Example No. 2
def add_schedule(self):
    self.__clear__()
    # Create a Schedule instance and check whether it is time to clean.
    set_clean = Schedule()
    if set_clean.check():
        print("(+) Time to Clean")
        new_clean = Clean()
    else:
        print("(-) It is not yet time to clean")
    set_clean.save()
    set_clean.show()
    new = Menu()
Example No. 3
import json
import sys


def __main__():
    nj_municipals = json.load(open('./json/nj_municipals.json'))
    counties = list(nj_municipals.keys())

    # Take the source URL and date from the command line when given,
    # otherwise look them up via Link().
    if len(sys.argv) == 1:
        url, date = Link()
    elif len(sys.argv) == 2:
        _, date = Link()
        url = sys.argv[1]
    else:
        url = sys.argv[1]
        date = sys.argv[2]
        print(url)
        print(date)

    # Parse, clean, and publish the data; csv_file and json_file are
    # defined elsewhere in the module.
    data = Parse(url, counties)
    total_df = Clean(csv_file, data, date, nj_municipals)
    Update(total_df, csv_file)
    Today(total_df, date, counties, json_file)
Example No. 4
def clean(sentence):
    # Strip HTML/JS tags, normalize the text, and return it as a plain string.
    return Clean(sentence).clean_html_and_js_tags().clean_text().text
Example No. 5
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os

try:
    import datetime
    from Login import Login
    from readconfig import ReadConfig

    from clean import Clean
    lj = Login()
    dc = Clean()

    _info = ReadConfig()
except Exception as e:
    print('Configuration file missing: %s' % e)
    print('Press Enter to exit')
    _k = input()
    os._exit(0)


def main():
    try:
        print('This program cleans company-name data -> edit the config file and run')
        print('Version: 1.3')
        print('Press Enter to start')
        k1 = input()
        # Read the raw data from the database
        print('Cleaning data -> please wait..')
        info1 = _info.get_input("col_name")   # column name
        info2 = _info.get_input("table_name")  # table name -> if a schema is used, add the '.' and configure the full table name
        sql1 = '''
        select {}
Example No. 6
def cleaning(self):
    self.__clear__()
    # Run a cleaning pass, then return to the menu.
    new_clean = Clean()
    new = Menu()
Example No. 7
import pandas as pd
# import numpy as np

# Uncomment one of the following lines to load org_data before cleaning:
# org_data = pd.read_excel('Sharma_2018-07-24.xlsx', usecols='E:AJ')
# org_data = pd.read_excel('../data/all_data.xlsx', usecols='E:AJ')
# backup_data = org_data

from clean import Clean

# Clean the raw dataframe and take the cleaned copy back.
clean = Clean(org_data)
clean.clean_data()
org_data = clean.org_data

from imputation import Impute

# Impute missing values on the cleaned dataframe.
impute = Impute(org_data)
impute.impute_data()
org_data = impute.org_data
Example No. 8
# Standard library imports needed below
import csv
import sys

# Import own files
from clean import Clean
import vb_encoder

# Global definitions
csv.field_size_limit(2 ** 30)

NUM_DOCS = 17153  # for progress bar purposes only
COURT_RANKINGS = {
    3: ['sg court of appeal', 'sg privy council', 'uk house of lords', 'uk supreme court', 'high court of australia', 'ca supreme court'],
    2: ['sg high court', 'singapore international commercial court', 'hk high court', 'hk court of first instance', 'uk crown court', 'uk court of appeal', 'uk high court', 'federal court of australia', 'nsw court of appeal', 'nsw court of criminal appeal', 'nsw supreme court']
}

# Create instances of imported classes
cleaner = Clean()


def usage():
    print(
        "Usage: "
        + sys.argv[0]
        + " -i dataset-file -d dictionary-file -p postings-file"
    )


# Writes out the total number of documents in the collection to the postings file
# This is basically N, to compute inverse document frequency
def write_collection_size_to_disk(collection_size: int, out_postings):
    # Open our postings file
    f_postings = open(out_postings, "wb")
Example No. 9
def __init__(self):
    # self.classifier = HackAriba()
    self.clean = Clean()