Beispiel #1
0
 def __WriteDict(dic, collection, FieldK, FieldV):
     """Store every key/value pair of ``dic`` as one document in ``collection``.

     Each pair becomes a document ``{FieldK: key, FieldV: value}``; all
     documents are written to the 'mining' database with a single ``insert``
     call. Nothing is written when ``dic`` is empty.

     Args:
         dic: Mapping to persist; iterated with ``iteritems()``.
         collection: Name of the target collection.
         FieldK: Document field name that receives each key.
         FieldV: Document field name that receives each value.
     """
     db = MongoDB.getConnection('mining')
     docs = [{FieldK: k, FieldV: v} for k, v in dic.iteritems()]
     # An empty mapping produces no documents. Skip the call in that case:
     # a PyMongo-style ``insert`` rejects an empty document list.
     if docs:
         db[collection].insert(docs)
Beispiel #2
0
 def __ReadDict(dic, collection, FieldK, FieldV):
     """Fill ``dic`` from the documents stored in ``collection``.

     For every document in the 'mining' database collection, the value of
     field ``FieldK`` becomes the key and the value of field ``FieldV``
     becomes the value of an entry in ``dic``. Whichever side is read from
     a field named "term" is encoded to UTF-8 before being stored.

     Args:
         dic: Mapping that is updated in place.
         collection: Name of the collection to read.
         FieldK: Document field used as the dictionary key.
         FieldV: Document field used as the dictionary value.
     """
     cursor = MongoDB.getConnection('mining')[collection].find()
     # The field names never change while iterating, so decide once which
     # side needs the UTF-8 encoding step.
     key_is_term = (FieldK == "term")
     value_is_term = (FieldV == "term")
     for record in cursor:
         key = record[FieldK]
         value = record[FieldV]
         if key_is_term:
             key = key.encode('utf-8')
         if value_is_term:
             value = value.encode('utf-8')
         dic[key] = value
Beispiel #3
0
 def __init__(self, config, nodeName, loadFromDB = False):
     """Configure the instance from the ``nodeName`` child of ``config``.

     Calls ``GlobalInfo.Init`` with the "__global__" section name, then
     reads the ``db``, ``collection`` and ``field`` entries of the
     ``data_source`` child node and obtains a connection to that database.

     Args:
         config: Configuration object exposing ``GetChild``.
         nodeName: Name of the configuration child node for this instance.
         loadFromDB: Stored as ``self.trained`` and forwarded to
             ``GlobalInfo.Init``.
     """
     self.node = config.GetChild(nodeName)
     self.trained = loadFromDB
     GlobalInfo.Init(config, "__global__", loadFromDB)

     # Data source settings all live under the "data_source" child node.
     source_node = self.node.GetChild("data_source")

     def read_setting(key):
         return source_node.GetChild(key).GetValue()

     db_name = read_setting('db')
     self.collection = read_setting('collection')
     self.field = read_setting('field')
     self.db = MongoDB.getConnection(db_name)
Beispiel #4
0
 def __init__(self, config, nodeName, loadFromFile = False):
     """Configure the instance from the ``nodeName`` child of ``config``.

     Reads ``rate`` (converted to float), ``method``, ``model_path``,
     ``people_tag`` and ``db`` from the configuration node and obtains a
     connection to the configured database. When ``loadFromFile`` is true,
     the black list is loaded from the file at ``model_path``.

     Args:
         config: Configuration object exposing ``GetChild``.
         nodeName: Name of the configuration child node for this instance.
         loadFromFile: Stored as ``self.trained``; when true, every
             non-blank line of the model file is parsed as an integer and
             added as a key of ``self.blackList`` with value 1.

     Raises:
         IOError: If the model file cannot be opened.
         ValueError: If a non-blank line of the model file is not an
             integer.
     """
     self.curNode = config.GetChild(nodeName)
     self.rate = float(self.curNode.GetChild("rate").GetValue())
     self.method = self.curNode.GetChild("method").GetValue()
     self.modelPath = self.curNode.GetChild("model_path").GetValue()
     self.people_tag_collection = self.curNode.GetChild('people_tag').GetValue()
     self.blackList = {}
     dbname = self.curNode.GetChild("db").GetValue()
     self.db = MongoDB.getConnection(dbname)
     self.trained = loadFromFile
     if loadFromFile:
         # The context manager closes the file even if a line fails to parse.
         with open(self.modelPath, "r") as model_file:
             for line in model_file:
                 # Tolerate blank lines (e.g. a trailing newline at the end
                 # of the file) instead of failing in int().
                 if line.strip():
                     self.blackList[int(line)] = 1
Beispiel #5
0
            doc = doc.decode("gbk").encode("utf-8")
        except:
            page_id  += process_num
            continue
        page_id  += process_num
        soup = BeautifulSoup(doc)
        word = soup.find('h1', "title")
        if word:
            #baike.append({'title':word.string, 'url':url, 'html':doc})
            #if not db.word_dic.find_one({'word':word.string}):
            words.append({'word':word.string, 'len':len(word.string)})
        matchs = soup.findAll(href=re.compile('^/view/\d+.htm'))
        for match in matchs:
            #if match.string:
            if match.string and not db.word_dic.find_one({'word':match.string}):
                words.append({'word':match.string, 'len':len(match.string)})
        if len(words) >= 10:
            db.word_dic.insert(words)
            words = []
            #db.baike.insert(baike)
            #baike = []

if __name__=="__main__":
    # Script entry point: start `process_num` worker processes running `son`.
    # Each worker receives the process count, its own index (counting up
    # from `startindex`) and the database connection.
    db = MongoDB.getConnection('mining')
    process_num = 1
    startindex = 1
    # NOTE(review): the connection is created here in the parent and handed
    # to every child process; confirm the object returned by
    # MongoDB.getConnection is safe to share across processes.
    for offset in range(process_num):
        worker = Process(target=son, args=(process_num, startindex + offset, db))
        worker.start()