Example #1
def get_disambi_title(self, infile):
    disambi_title = {}
    for line in LoadFile.readline(infile):
        words = line.strip().split("\",\"")
        title_tmp = Clean.clean_word(words[1], clean_level="title")
        disambi_tmp = Clean.clean_word(words[0], clean_level="disambi")
        disambi_title[disambi_tmp] = title_tmp
    return disambi_title
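For context, get_disambi_title assumes each line is a fully quoted two-column CSV record, so splitting on "," leaves a stray outer quote at each end; a minimal sketch with a made-up sample line:

line = '"Apple (fruit)","Apple"\n'        # hypothetical input record
words = line.strip().split("\",\"")
print(words)  # ['"Apple (fruit)', 'Apple"'] -- residual outer quotes remain
# Clean.clean_word is presumably what strips those residual quotes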
Example #2
def tokenize(articles):
    results = []
    tokenizer = MeCabTokenizer(
        user_dic_path='/usr/local/lib/mecab/dic/mecab-ipadic-neologd')
    for article in articles:
        clean = Clean(article.html)
        cleaned = clean.clean_html_and_js_tags().clean_text().clean_code()
        tokens = tokenizer.extract_noun_baseform(cleaned.text)
        results.append(tokens)
    return list(chain.from_iterable(results))
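The trailing chain.from_iterable call flattens the per-article token lists into a single token stream. A quick stdlib illustration:

from itertools import chain
tokens = list(chain.from_iterable([["a", "b"], ["c"]]))
print(tokens)  # -> ['a', 'b', 'c']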
Example #3
def main():
    with open("./410_baidu/410_disambi_infobox.csv", 'r',
              encoding='UTF-8') as inf:
        lines = inf.readlines()
        f = open("./410_baidu/410_disambi_infobox_out.csv",
                 "w",
                 encoding='utf-8')
        list_attr = []
        title_list = get_word_list("./410_baidu/410_title.csv")
        err_count = 0
        counts = {}
        for line in tqdm(lines):
            words = line.strip().split(",")
            disambi = Clean.clean_word(words[0], clean_level='disambi')
            infobox = ",".join(words[1:])
            try:
                info_dict = json.loads(json.loads(infobox))
                for attr in list(info_dict.keys()):  # snapshot the keys; the dict is mutated below
                    clean_attr = Clean.clean_word(attr)
                    info_dict[clean_attr] = info_dict.pop(attr)
                    value = info_dict[clean_attr]
                    counts[clean_attr] = counts.setdefault(clean_attr, 0) + 1
                    list_attr.append(clean_attr)
                    value_split = re.split(u"[,。、,/]", value.strip())
                    for v in value_split:
                        v = Clean.clean_word(v).strip(u"等").strip(u"收起")
                        title_list.append(v)
                        f.write("\"" + disambi + "\",\"" + clean_attr +
                                "\",\"" + v + "\"" + "\r\n")
            except Exception as e:
                print(e)
                err_count += 1
        title_list = [t.strip(u"\\") for t in title_list]
        title_list = list(set(title_list))
        list_attr = list(set(list_attr))
        sort_counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)
        with open("./sort_counts.txt", "w", encoding='utf-8') as ouf:
            for i in sort_counts:
                ouf.write(str(i) + "\n")
        with open("./all_attr.txt", "w", encoding='utf-8') as ouf:
            for word_counts in sort_counts:
                if word_counts[1] >= 10:
                    ouf.write(str(word_counts[0]) + "\n")
        with open("./410_baidu/410_title_new.csv", "w",
                  encoding='utf-8') as ouf:
            for i in title_list:
                ouf.write("\"" + i + "\"\r\n")
        with open("./410_baidu/all_attr.txt", "w", encoding='utf-8') as ouf:
            for i in list_attr:
                ouf.write(i + "\n")

        print("err_count: ", err_count)
Example #4
def download(videoId):
    try:
        option = (Options.AUDIO
                  if request.args.get("option", "") == "audio"
                  else Options.BOTH)
        filename = YoutubeDownloader.download(videoId, option)
        if not filename:
            raise RuntimeError("download failed")  # caught below and mapped to a 500
        Clean.scheduleRemove(filename)
        return send_file(os.path.join(".", filename))
    except YoutubeDownloader.VideoNotFoundException:
        abort(404)
    except Exception as e:
        app.logger.error(e)
        abort(500)
Example #5
def clean_disambi_redirect(infile="source", outfile="target"):
    with open(infile) as inf:
        reader = csv.reader(inf)
        err_counts = 0
        with open(outfile, "w") as ouf:
            for line in tqdm(reader):
                if len(line) != 2:
                    err_counts += 1
                    continue
                disambi = Clean.clean_word(line[0], clean_level='disambi')
                redirect = Clean.clean_word(line[1], clean_level='redirect')
                ouf.write("\"" + disambi + "\",\"" + redirect + "\"\n")
            print("err_counts for disambi_redirect:%d" % (err_counts))
Example #6
def clean_title_disambi(infile="title_disambi.csv", outfile="title_disambi_out.csv"):
    with open(infile, "r",encoding='utf-8') as inf:
        lines = inf.readlines()
        err_counts = 0
        with open(outfile, "w",encoding='utf-8') as ouf:
            for line in tqdm(lines):
                words = line.strip().split("\",\"")
                if len(words) != 2:
                    err_counts += 1
                    continue
                title = Clean.clean_word(words[0], clean_level='title')
                disambi = Clean.clean_word(words[1], clean_level='disambi')
                ouf.write("\"" + title + "\",\"" + disambi + "\"\r\n")
            print("err_counts for disambi_redirect: ", err_counts)
Example #7
File: write.py Project: buque/tidy
    def run(self):
        # Thread-local counters to reduce lock contention
        times = 0
        etimes = 0
        print ("Starting ", self.threadName, "...")

        while True:
            try:
                value = Writer.queue.get_nowait()
            except queue.Empty:
                etimes += 1
                if etimes%50 == 0 and times != 0:
                    Writer.writeLock.acquire()
                    Writer.writeSum += times
                    Writer.writeLock.release()
                    times = 0
                    etimes = 0
                    time.sleep(0.5)
                    continue
            else:
                data = Clean.wash(value)
                self.putData(data)
                times += 1

                # Update the shared counter under the lock
                if times%50 == 0:
                    Writer.writeLock.acquire()
                    Writer.writeSum += times
                    Writer.writeLock.release()
                    times = 0
Example #8
def get_word_list(filename):
    with open(filename, "r", encoding='utf-8') as inf:
        lines = inf.readlines()
        lines = [
            Clean.clean_word(line, clean_level='title')
            for line in lines
        ]
        return lines
Example #9
def get_title(infile):
    all_title = set([])
    for line in LoadFile.readline(infile):
        title_tmp = Clean.clean_word(line.strip(), clean_level="title")
        title_tmp = title_tmp.strip().strip("\"")
        if title_tmp == "":
            continue
        all_title.add(title_tmp)
    return all_title
Example #10
def clean_disambi_subject(infile="disambi_subject.csv",
                          outfile="disambi_subject_out.csv"):
    with open(infile) as inf:
        lines = inf.readlines()
        err_counts = 0
        with open(outfile, "w") as ouf:
            for line in tqdm(lines):
                words = line.strip().split("\",\"")
                if len(words) != 2:
                    err_counts += 1
                    continue
                disambi = Clean.clean_word(words[0], clean_level='disambi')
                subject = Clean.clean_word(words[1], clean_level='subject')
                ouf.write("\"" + disambi + "\",\"" + subject + "\"\r\n")
            print("err_counts for disambi_subject: ", err_counts)
Example #11
def add_schedule(self):
    self.__clear__()
    # Create a Schedule instance and check whether it is time to clean
    set_clean = Schedule()
    if set_clean.check():
        print("(+) Time to Clean")
        new_clean = Clean()
    else:
        print("(-) It is not yet time to clean")
    set_clean.save()
    set_clean.show()
    new = Menu()
Example #12
def clean_disambi_literal(infile="source", outfile="target"):
    with open(infile) as inf:
        reader = csv.reader(inf)
        err_counts = 0
        with open(outfile, "w") as ouf:
            for line in tqdm(reader):
                if len(line) != 2:
                    err_counts += 1
                    continue
                disambi = Clean.clean_word(line[0], clean_level='disambi')
                literal = Clean.clean_word(line[1], clean_level='literal')
                if literal != '' and disambi != '':
                    if '[朱槿品种]' in disambi:
                        # Special-case one malformed record: 快乐 ("Happy"), a hibiscus cultivar
                        literal = '快乐'
                        disambi = '快乐[[朱槿品种]]'
                    if '"' in literal:
                        literal = literal.replace('"', '""')
                    if '\\' in literal:
                        literal = literal.replace('\\', '')
                    if '"' in disambi:
                        disambi = disambi.replace('"', '""')
                    ouf.write("\"" + disambi + "\",\"" + literal + "\"\n")
            print("err_counts for disambi_redirect:%d" % (err_counts))
Example #13
class HelloRPC(object):

    def __init__(self):
        # self.classifier = HackAriba()
        self.clean = Clean()

    def get_sentiment_of_list_of_tweets(self, list_of_tweets):
        list_of_text = []
        for key, value in list_of_tweets.items():
            list_of_text.append(value)
        print(list_of_text)
        clean_text = self.clean.clean_data_to_feed_classifier(list_of_text)
        print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
        print(clean_text)
        print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
        return clean_text
Example #14
def __main__():
    nj_municipals = json.load(open('./json/nj_municipals.json'))
    counties = list(nj_municipals.keys())

    if len(sys.argv) == 1:
        url, date = Link()
    elif len(sys.argv) == 2:
        _, date = Link()
        url = sys.argv[1]
    else:
        url = sys.argv[1]
        date = sys.argv[2]
        print(url)
        print(date)

    data = Parse(url, counties)
    total_df = Clean(csv_file, data, date, nj_municipals)
    Update(total_df, csv_file)
    Today(total_df, date, counties, json_file)
Example #15
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
try:
    import datetime
    from Login import Login
    from readconfig import ReadConfig

    from clean import Clean
    lj = Login()
    dc = Clean()

    _info = ReadConfig()
except Exception as e:
    print('Configuration file missing: %s' % e)
    print('Press Enter to exit')
    _k = input()
    os._exit(0)
def main():
    try:
        print('This tool cleans company-name data; edit the config file, then run it')
        print('Version: 1.3')
        print('Press Enter to start')
        k1 = input()
        # Read the raw rows from the database
        print('Cleaning the data, please wait...')
        info1 = _info.get_input("col_name")   # column name
        info2 = _info.get_input("table_name")  # table name; add the schema prefix if the table has one
        sql1 = '''
        select {}
Example #16
def get_word_list(in_f):
    reader = csv.reader(in_f)
    lines = [Clean.clean_word(line[0], clean_level='literal') for line in reader]
    lines = [l for l in lines if l != '']  # drop all empty cleaned words
    return lines
Example #17
def main():
    with open("source/disambi.csv") as in_f_disambi, open("source/infobox.csv", "r") as in_f_infobox,\
            open('source/literal.csv') as in_f_literal, open("target/disambi_infobox.csv", "w") as out_f:
        literal_list = get_word_list(in_f_literal)
        disambi_reader = csv.reader(in_f_disambi)
        info_lines = in_f_infobox.readlines()
        list_attr = []
        list_value = []
        err_count = 0
        attr_counts = {}
        for (disambi, infobox) in tqdm(zip(disambi_reader, info_lines)):
            disambi = Clean.clean_word(disambi[0], clean_level='disambi')
            if '"' in disambi:
                disambi = disambi.replace('"', '""')
            if infobox.strip() != '{}':
                try:
                    #print(json.loads(infobox))
                    info_dict = json.loads(json.loads(infobox).replace("\\", r"\\"))
                    clean_info_dict = {}
                    for attr in info_dict.keys():
                        clean_attr = Clean.clean_word(attr, clean_level='others')
                        if clean_attr not in clean_info_dict.keys():
                            clean_info_dict[clean_attr] = info_dict[attr]
                    for clean_attr in clean_info_dict.keys():
                        value = str(','.join(clean_info_dict[clean_attr])) if clean_info_dict[clean_attr] != [] else None
                        if value:
                            value = value.replace('"', '""')
                            attr_counts[clean_attr] = attr_counts.setdefault(clean_attr, 0) + 1  # collect attribute frequency
                            list_attr.append(clean_attr)
                            list_value.append(value)
                            out_f.write("\"" + disambi + "\",\"" + clean_attr + "\",\"" + value + "\"\n")
                except Exception as e:
                    print(f'Error:{e},Disambi:{disambi},Infobox:{infobox}')
                    err_count += 1
        literal_list = [t.replace('\"','').replace("\\",'').replace('"','""') for t in literal_list]
        literal_list = list(set(literal_list))
        list_attr = list(set(list_attr))
        list_value = list(set(list_value))
        sort_counts = sorted(attr_counts.items(), key=lambda x: x[1], reverse=True)
        with open("target/sorted_all_attr.txt", "w") as ouf:
            for i in sort_counts:
                ouf.write(str(i) + "\n")
        with open("target/sorted_filerted_attr.txt", "w") as ouf:
            for word_counts in sort_counts:
                if word_counts[1] >= 10:
                    ouf.write(str(word_counts[0]) + "\n")
        with open("target/literal.csv", "w") as ouf:
            for i in literal_list:
                ouf.write("\"" + i + "\"\n")
        with open("target/attr.txt", "w") as ouf:
            for i in list_attr:
                ouf.write(i + "\n")
        with open("target/value.csv", "w") as ouf:
            for i in list_value:
                ouf.write("\"" + i + "\"\n")
            
        print("err_count: ", err_count)
Example #18
'''
Clean the disambi names; everything else is left unchanged.
'''
import re
from clean import Clean
from tqdm import tqdm
import csv

with open("source/disambi_attrs.csv") as inf:
    title_dict = {}
    err_count = 0
    reader = csv.reader(inf)
    for line in tqdm(reader):
        curLink = line[-2]
        exterLink = line[-1]
        clean_disambi = Clean.clean_word(line[0], 'disambi')
        if '"' in clean_disambi:
            clean_disambi = clean_disambi.replace('"', '""')
        if curLink == 'http://www.baike.com/wiki/%22':
            # The entry for the double-quote character itself (标点符号 = "punctuation mark")
            clean_disambi = '""[标点符号]'
        if len(line) < 5:
            print(f'\n{line},{len(line)}')
            err_count += 1
            literal = '""'
            abstract = Clean.clean_word(line[1], 'others').strip()
        else:
            literal = Clean.clean_word(line[0], 'title')
            abstract = line[2] if len(line) == 5 else ''.join(line[2:-2])
            # '编辑摘要' ("edit summary") is leftover page-widget text
            abstract = abstract.replace('编辑摘要 ', '').replace('"', "'").strip()
        title_dict[clean_disambi] = [literal, abstract, curLink, exterLink]
    print("Error count:%d" % (err_count))
Example #19
#!/usr/bin/env python
# coding=utf-8
import re
from clean import Clean
from tqdm import tqdm

with open("./410_baidu/410_disambi.csv", "r", encoding='utf-8') as inf:
    title_dict = {}
    count = 0
    lines = inf.readlines()
    for line in tqdm(lines):
        words = line.strip().split("\",\"")
        if len(words) != 4:
            count += 1
        clean_disambi = Clean.clean_word(words[0], 'disambi')
        title_dict[clean_disambi] = words[1:]
    print("Error lines: ", count)
    with open("./410_baidu/410_disambi_new.csv", "w", encoding='utf-8') as ouf:
        for i in title_dict.keys():
            ouf.write("\"" + i + "\",\"" + "\",\"".join(title_dict[i]) +
                      "\r\n")
Example #20
pricing = os.path.join(BASE_DIR, 'data/pricing_data.csv')
df = pd.read_csv(pricing, encoding='unicode_escape')

# Drop rows with all values missing
df.dropna(how='all', inplace=True)

# Strip non-ASCII characters from every string cell
clean = df.applymap(cd.remove_non_ascii, na_action='ignore')

# Finds pounds, or pounds and pence and returns as a float
clean['price'] = clean['price'].map(cd.pounds_and_pence, na_action='ignore')

clean.boxplot('price')
# plt.savefig('graphs/price_boxplot.png', dpi=400, bbox_inches='tight')
plt.show()

checkin_dates = os.path.join(BASE_DIR, 'collection/dates.json')
with open(checkin_dates) as f:
    checkins = json.load(f)
dates = checkins['checkin']
"""Calculates interquartile range and removes outliers"""
for date in dates:
    date_group = clean.groupby(['date']).get_group(date)
    LL, UL = cd.outlier_limits(date_group['price'])
    outliers = date_group['price'][(date_group['price'] < LL) |
                                   (date_group['price'] > UL)]
    clean.drop(outliers.index, axis=0, inplace=True)

output = os.path.join(BASE_DIR, 'data/cleaned_pricing_data.csv')
clean.to_csv(output, index=False)
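cd.outlier_limits itself is not shown on this page; a typical Tukey-fence (IQR) implementation consistent with how it is used above might look like this (an assumption, not the project's actual code):

def outlier_limits(series, k=1.5):
    # Tukey fences: values more than k * IQR beyond the quartiles count as outliers
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    iqr = q3 - q1
    return q1 - k * iqr, q3 + k * iqr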
Example #21
import csv
import nltk
import time
import gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from progress.bar import Bar
from progress.spinner import Spinner

# Import own files
from clean import Clean

cleaner = Clean()

# Increase csv field size limit
csv.field_size_limit(2**30)

NUM_DOCS = 17153  # for progress bar purposes only

sentences = []
with open("dataset.csv", newline='', encoding='utf-8') as csvfile:

    #Start time
    start = time.time()

    # Start progress bar. max obtained from reading in the excel file and checking number of rows
    indexing_progress_bar = Bar("Reading in documents to train Word2Vec Model",
                                max=NUM_DOCS)

    # Read in CSV dataset and remove headers from consideration
    csv_reader = csv.reader(csvfile)
Example #22
import re
import json
from tqdm import tqdm
from clean import Clean


def get_word_list(filename):
    with open(filename, "r", encoding='utf-8') as inf:
        lines = inf.readlines()
        #        print "type line: ", type(lines[0].encode("utf-8"))
        lines = [Clean.clean_word(line, clean_level='title') for line in lines]
        return lines


print(Clean.clean_word(u"\"你好   呀#\"$%^&*@!,。、;:‘’】季    候【"))


def main():
    with open("./410_baidu/410_disambi_infobox.csv", 'r',
              encoding='UTF-8') as inf:
        lines = inf.readlines()
        f = open("./410_baidu/410_disambi_infobox_out.csv",
                 "w",
                 encoding='utf-8')
        list_attr = []
        title_list = get_word_list("./410_baidu/410_title.csv")
        err_count = 0
        counts = {}
        for line in tqdm(lines):
            words = line.strip().split(",")
Example #23
# Import own files
from clean import Clean
import vb_encoder

# Global definitions
csv.field_size_limit(2 ** 30)

NUM_DOCS = 17153  # for progress bar purposes only
COURT_RANKINGS = {
    3: ['sg court of appeal', 'sg privy council', 'uk house of lords', 'uk supreme court', 'high court of australia', 'ca supreme court'],
    2: ['sg high court', 'singapore international commercial court', 'hk high court', 'hk court of first instance', 'uk crown court', 'uk court of appeal', 'uk high court', 'federal court of australia', 'nsw court of appeal', 'nsw court of criminal appeal', 'nsw supreme court']
}

# Create instances of imported classes
cleaner = Clean()


def usage():
    print(
        "Usage: "
        + sys.argv[0]
        + " -i dataset-file -d dictionary-file -p postings-file"
    )


# Writes out the total number of documents in the collection to the postings file
# This is basically N, to compute inverse document frequency
def write_collection_size_to_disk(collection_size: int, out_postings):
    # Open our postings file
    f_postings = open(out_postings, "wb")
Example #24
def cleaning(self):
    self.__clear__()
    new_clean = Clean()
    new = Menu()
Example #25
import pandas as pd

# org_data must be loaded before cleaning, e.g.:
# org_data = pd.read_excel('Sharma_2018-07-24.xlsx', usecols='E:AJ')
# org_data = pd.read_excel('../data/all_data.xlsx', usecols='E:AJ')
# backup_data = org_data

from clean import Clean

clean = Clean(org_data)
clean.clean_data()
org_data = clean.org_data

from imputation import Impute

impute = Impute(org_data)
impute.impute_data()
org_data = impute.org_data
Example #26
#!/usr/bin/env python
# coding=utf-8

from collections import defaultdict
from clean import Clean
from tqdm import tqdm

with open("./410_baidu/410_disambi_subject.csv") as inf:
    lines = inf.readlines()
    #    all_subject = defaultdict(list)
    total_subject = []
    f = open("./410_baidu/disambi_subject.csv", "w")
    for line in tqdm(lines):
        words = line.strip().split(",")
        disambi = Clean.clean_word(words[0].decode('utf-8'),
                                   clean_level='disambi').encode("utf-8")
        subjects = words[1:]
        subjects = [
            Clean.clean_word(s.decode('utf-8'),
                             clean_level="subject").encode("utf-8")
            for s in subjects
        ]
        #        subjects = [s.replace("\"", "").strip("\\") for s in subjects]
        #        subjects = [s.strip() for s in subjects]
        total_subject.extend(subjects)
        for subject in subjects:
            if subject == "":
                continue
            f.write("\"" + disambi + "\",\"" + subject + "\"\r\n")
#        all_subject[disambi].append(subjects)
    f.close()
Example #27
from collections import defaultdict
from clean import Clean
from tqdm import tqdm
import csv

with open("source/disambi_topic.csv") as in_f, open("target/disambi_topic.csv",
                                                    "w") as out_f:
    reader = csv.reader(in_f)
    total_topic = []
    for line in tqdm(reader):
        #print(line)
        disambi = line[0]
        topics = []
        for i in line[1].split(','):
            topics.extend(i.split())
        disambi = Clean.clean_word(disambi, clean_level='disambi')
        topics = [Clean.clean_word(s, clean_level="topic") for s in topics]
        total_topic.extend(topics)
        for topic in topics:
            if topic == "":
                continue
            if '[朱槿品种]' in disambi:
                disambi = '快乐[[朱槿品种]]'
            if '"' in disambi:
                disambi = disambi.replace('"', '""')
            if '"' in topic:
                topic = topic.replace('"', '""')
            out_f.write("\"" + disambi + "\",\"" + topic + "\"\n")
    total_topic = list(set(total_topic))
    print("Total topics:%d " % (len(total_topic)))
Example #28
#!/usr/bin/env python
# coding=utf-8
import re
from clean import Clean
from tqdm import tqdm

with open("./410_baidu/410_disambi.csv") as inf:
    title_dict = {}
    count = 0
    lines = inf.readlines()
    for line in tqdm(lines):
        words = line.strip().split("\",\"")
        if len(words) != 4:
            count += 1
        clean_disambi = Clean.clean_word(words[0].decode('utf-8'), 'disambi')
        title_dict[clean_disambi] = words[1:]
    print "Error lines: ", count
    with open("./410_baidu/410_disambi_new.csv", "w") as ouf:
        for i in title_dict.keys():
            ouf.write("\"" + i.encode('utf-8') + "\",\"" +
                      "\",\"".join(title_dict[i]) + "\r\n")
Example #29
def clean(sentence):
    return Clean(sentence).clean_html_and_js_tags().clean_text().text
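Assuming the same chainable Clean wrapper as in Example #2, a hypothetical call would be:

print(clean("<p>Hello <b>world</b></p>"))  # made-up input; should yield plain text like "Hello world"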
Example #30
#!/usr/bin/env python
# coding=utf-8

from collections import defaultdict
from clean import Clean
from tqdm import tqdm

with open("./410_baidu/410_disambi_subject.csv", "r", encoding='utf-8') as inf:
    lines = inf.readlines()
    #    all_subject = defaultdict(list)
    total_subject = []
    f = open("./410_baidu/disambi_subject.csv", "w", encoding='utf-8')
    for line in tqdm(lines):
        words = line.strip().split(",")
        disambi = Clean.clean_word(words[0], clean_level='disambi')
        subjects = words[1:]
        subjects = [
            Clean.clean_word(s, clean_level="subject") for s in subjects
        ]
        #        subjects = [s.replace("\"", "").strip("\\") for s in subjects]
        #        subjects = [s.strip() for s in subjects]
        total_subject.extend(subjects)
        for subject in subjects:
            if subject == "":
                continue
            f.write("\"" + disambi + "\",\"" + subject + "\"\r\n")
#        all_subject[disambi].append(subjects)
    f.close()
    total_subject = list(set(total_subject))
    print("Total subjects: ", len(total_subject))
    with open("./410_baidu/all_subject.csv", "w", encoding='utf-8') as ouf: