#!/usr/bin/python
"""Export the 'metadatas' collection of a database to two text files.

Usage: pass the database name as the first command-line argument.

Output files (named after the database):
  * <db>_domaine.txt   -- the header '**** *domaine', then the 'domaine'
                          field of every document, each followed by a newline.
  * <db>_metadatas.txt -- the header '**** *metadata', then the 'meta' field
                          of every document with the strings listed in
                          filters.filters_metadata (and their swapped-case
                          variants) removed and non-ASCII characters dropped,
                          each followed by a newline.
"""
import sys

from mongodb import mongodb
import filters

db = sys.argv[1]
mdb = mongodb.mongodb('localhost', 27017, db)

with open(db + '_domaine.txt', 'w') as fw:
    fw.write('**** *domaine\n')
    for domaine in mdb.selectall('metadatas'):
        fw.write(domaine['domaine'])
        fw.write('\n')

with open(db + '_metadatas.txt', 'w') as fw:
    fw.write('**** *metadata\n')
    for domaine in mdb.selectall('metadatas'):
        meta = domaine['meta']
        for filt in filters.filters_metadata:
            # Remove the filter string both as given and with its case swapped.
            meta = meta.replace(filt, '')
            meta = meta.replace(filt.swapcase(), '')
        # encode(..., 'ignore') drops non-ASCII characters; decoding back gives
        # text again, because a file opened in text mode rejects bytes on
        # Python 3.
        fw.write(meta.encode('ascii', 'ignore').decode('ascii'))
        fw.write('\n')
# No explicit fw.close(): each `with` block already closed its file.
def __init__(self, db):
    """Set up the database handle and the whitelist containers.

    Args:
        db: database name, passed as the third argument to
            ``mongodb.mongodb`` together with host 'localhost' and
            port 27017.
    """
    host, port = 'localhost', 27017
    self.mdb = mongodb.mongodb(host, port, db)
    # Starts empty; nothing is added to it in this method.
    self.white_list = list()
    # Domains the instance starts out with as whitelisted.
    self.white_domaine = [
        'msn.com',
        'google.com',
        'wikipedia.fr',
        'free.fr',
        'linkedin.com',
    ]
def __init__(self, host, db):
    """Create the database handle and an empty network table.

    Args:
        host: host passed to ``mongodb.mongodb``.
        db: database name passed to ``mongodb.mongodb``.
    """
    port = 27017  # fixed port used for every connection
    self.db = mongodb.mongodb(host, port, db)
    self.networks = dict()
return f.read() ret = readCSV('dict.txt') s = [] for ele in ret.split('\n'): s.append(ele) ''' 抽取并按月份合并关键词 ''' cols = ['law_detail', 'justice_detail', 'rule_detail', 'dept_detail', 'industry_detail', 'party_detail'] rst = [[], [], [], [], [], [], [], [], [], [], [], []] dic = [{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}] for i in range(0, len(cols)): for ele in mongodb().find_all('pkulaw', cols[i]): # 以下循环用于简化12个条件分支 for j in range(0, 12): if ele['pub_date'] != '' and int(ele['pub_date'].split('.')[1]) > j and int(ele['pub_date'].split('.')[1]) < j + 2 and int(ele['pub_date'].split('.')[2]) < 50: for e in jieba.analyse.extract_tags(getContent(str(ele['content'][0])) , topK=30, withWeight=False, allowPOS=()): if e in s: rst[j].append(e) ''' 按月份统计词频 ''' for i in range(0, len(rst)): for e in rst[i]: if e not in dic[i].keys(): dic[i][e] = 1 else: dic[i][e] += 1
# Write '<db>_cleaned.log' with one 'ip;domaine' line per matching record.
#
# Arguments: argv[1] is a glob pattern selecting files, argv[2] the database
# name. For every matched file, its base name without extension is looked up
# as 'domaine' in the 'new_domaines' collection.
import sys
import glob
import os
from mongodb import mongodb

pathdirectory = glob.glob(sys.argv[1])
db = sys.argv[2]
mdb = mongodb.mongodb('localhost', 27017, db)

with open(db + '_cleaned.log', 'w') as fw:
    for name_file in pathdirectory:
        fileName, fileExtension = os.path.splitext(name_file)
        # basename() handles the platform's separator; splitting on '/' alone
        # left the whole path in place for backslash-separated paths.
        domaine = os.path.basename(fileName)
        results = mdb.selectbycreteria('domaine', domaine, 'new_domaines')
        for result in results:
            fw.write(result['ip'] + ';' + result['domaine'] + '\n')
# No explicit fw.close(): the `with` block already closed the file.
''' ''' # 获取并存储法律 for ele in mongodb().find_all('pkulaw1', 'law'): print(parsing().get_html(ele).get_json(mongodb(), 'pkulaw1', 'law_details')) # 获取并存储行政法规 for ele in mongodb().find_all('pkulaw1', 'rule'): print(parsing().get_html(ele).get_json(mongodb(), 'pkulaw1', 'rule_details')) # 获取并存储司法解释 for ele in mongodb().find_all('pkulaw1', 'justice'): print(parsing().get_html(ele).get_json(mongodb(), 'pkulaw1', 'justice_details')) ''' # 获取并存储部门规章 for ele in mongodb().find_all('pkulaw1', 'dept'): print(parsing().get_html(ele).get_json(mongodb(), 'pkulaw1', 'dept_details')) # 获取并存储党内法规 for ele in mongodb().find_all('pkulaw1', 'party'): print(parsing().get_html(ele).get_json(mongodb(), 'pkulaw1', 'party_details')) ''' # 获取并存储团体规定 for ele in mongodb().find_all('pkulaw', 'group'): print(parsing().get_html(ele).get_json(mongodb(), 'pkulaw1', 'group_detail')) ''' # 获取并存储行业规定 for ele in mongodb().find_all('pkulaw1', 'industry'): print(parsing().get_html(ele).get_json(mongodb(), 'pkulaw1', 'industry_details'))
def __init__(self, cols):
    """Create the database handle and the empty working lists.

    Args:
        cols: stored unchanged on the instance as ``__cols``.
    """
    self.__mongodb = mongodb()
    # Three independent empty lists, none filled in this method.
    self.__sets, self.__realSets, self.__indexs = [], [], []
    self.__cols = cols
def __init__(self):
    """Create the browser wrapper and the database handle.

    Reads ``__url``, ``__exe_path`` and ``__service_log_path`` from the
    instance; they are not assigned here (presumably class attributes --
    confirm against the class body).
    """
    browser_args = (
        self.__url,
        self.__exe_path,
        self.__service_log_path,
    )
    self.__browser_instance = browser(*browser_args)
    self.__mongo = mongodb()
def __init__(self, db):
    """Create the handle for database ``db`` and initialise the whitelists.

    Args:
        db: database name handed to ``mongodb.mongodb`` along with
            'localhost' and port 27017.
    """
    self.mdb = mongodb.mongodb('localhost', 27017, db)
    self.white_list = list()  # empty at construction time
    self.white_domaine = [
        'msn.com',
        'google.com',
        'wikipedia.fr',
        'free.fr',
        'linkedin.com',
    ]