def getdf_from(items):
    """Map each n-gram to the set of item ids it appears in
    (the raw data behind a document-frequency count)."""
    df = {}
    for item in items:
        feat = extractd.getngram(item['text'])
        for w in feat:
            try:
                df[w].add(item['id'])
            except KeyError:
                df[w] = set([ item['id'] ])
    #return [ (w, len(f)) for w, f in df.items() ]
    return df
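# Minimal helper sketch (not part of the original module): collapse the id
# sets returned by getdf_from() into plain document-frequency counts, mirroring
# the commented-out return expression above. The name df_counts is hypothetical.
def df_counts(df):
    return dict((w, len(ids)) for w, ids in df.items())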
def feature(t_begin, t_end, screen_names):
    # For each user, collect n-grams, hashtags, and URLs from their tweets
    # created between t_begin and t_end. Returns:
    #   ngram: feature -> {screen_name: occurrence count}
    #   table: feature -> set of tweet ids containing that feature
    ngram = {}
    table = {}
    for j, u in enumerate(screen_names):
        query = {
            'created_at': { '$gt': t_begin, '$lt': t_end },
            'screen_name': u,
        }
        for item in db.find(query):
            text = item['text']
            id = item['id']
            # If this tweet is a reply, append the replied-to tweet's text.
            try:
                replied_id = item['in_reply_to_status_id']
                if replied_id:
                    for ii in db.find({ 'id': replied_id }):
                        text += u'。%s' % ii['text']
            except KeyError:
                pass
            """
            feats = bow.bagofwords(text)
            for f in feats:
                print(' '.join(f))
            continue
            """
            feat = extractd.getngram(text)
            for w in set(feat):
                # Skip single characters, all-hiragana tokens, and English stopwords.
                if len(unicode(w)) < 2:
                    continue
                hira = patterns.hiragana.findall(unicode(w))
                if hira and len(hira[0]) == len(unicode(w)):
                    continue
                if w in patterns.english_words:
                    continue
                if w not in ngram:
                    ngram[w] = {}
                utils.count(ngram[w], u)
                try:
                    #table[w].append(text)
                    table[w].add(id)
                except KeyError:
                    #table[w] = [ text ]
                    table[w] = set([ id ])
            tags = extractd.gethashtags(item)
            for t in set(tags):
                if t not in ngram:
                    ngram[t] = {}
                utils.count(ngram[t], u)
                try:
                    #table[t].append(text)
                    table[t].add(id)
                except KeyError:
                    #table[t] = [ text ]
                    table[t] = set([ id ])
            urls = extractd.geturls(item)
            for l in set(urls):
                if l not in ngram:
                    ngram[l] = {}
                utils.count(ngram[l], u)
                try:
                    #table[l].append(text)
                    table[l].add(id)
                except KeyError:
                    #table[l] = [ text ]
                    table[l] = set([ id ])
        # Progress: users processed so far.
        print('%d/%d' % (j, len(screen_names)))
    return ngram, table
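# Usage sketch (hypothetical values; assumes `db` is a MongoDB collection of
# tweet documents with 'text', 'id', 'created_at', 'screen_name', and optional
# 'in_reply_to_status_id' fields, and that extractd, patterns, and utils are
# imported at module level):
#
#   ngram, table = feature(t_begin, t_end, ['alice', 'bob'])
#   df = getdf_from(db.find({ 'created_at': { '$gt': t_begin, '$lt': t_end } }))
#   # ngram[w] -> {screen_name: count}, table[w] -> set of tweet ids, df[w] -> set of item ids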