Example #1
0
def process_batch(cur, geo=False, fsw=False, stem=False):
    """Run ``text_process`` over every document yielded by ``cur``.

    Args:
        cur: iterable of documents that also provides ``count()``; every
            document is indexed with the ``'created'`` key.
        geo: forwarded to ``text_process`` as ``geo``.
        fsw: forwarded to ``text_process`` as ``filter_sw``.
        stem: forwarded to ``text_process`` as ``stem``.

    Returns:
        A ``(records, first_created, last_created)`` tuple. ``records`` holds
        one dict per document with the keys ``'words'`` (element 0 of the
        ``text_process`` result, split on whitespace), ``'created_at'`` and
        ``'geo'`` (element 1 of the ``text_process`` result).
        ``first_created`` and ``last_created`` are the ``'created'`` values of
        the first and last documents, or ``None`` when ``cur`` yields nothing.
    """
    st = datetime.now()
    total = cur.count()
    records = []
    # Defaults so an empty cursor returns cleanly instead of hitting
    # unbound locals after the loop.
    first_created = None
    last_created = None
    for seen, doc in enumerate(cur, 1):
        if seen == 1:
            first_created = doc['created']
        processed = text_process(doc, geo=geo, filter_sw=fsw, stem=stem)
        records.append({
            'words': processed[0].split(),
            'created_at': doc['created'],
            'geo': processed[1],
        })
        progress(seen, total, skip=100)
        last_created = doc['created']
    # Single-argument print(...) emits the same text under Python 2 and 3.
    print('%s %s' % ('\nretrieval and processing took', datetime.now() - st))
    return records, first_created, last_created
Example #2
0
def process_batch(cur, geo=False, fsw=False, stem=False):
    """Process each document from ``cur`` with ``text_process``.

    Args:
        cur: iterable of documents exposing ``count()``; each document is
            read through its ``'created'`` key.
        geo: passed to ``text_process`` as ``geo``.
        fsw: passed to ``text_process`` as ``filter_sw``.
        stem: passed to ``text_process`` as ``stem``.

    Returns:
        ``(rows, start, end)`` where ``rows`` is a list of dicts with the keys
        ``'words'``, ``'created_at'`` and ``'geo'`` (one per document), and
        ``start`` / ``end`` are the ``'created'`` values of the first and the
        last document. Both are ``None`` if ``cur`` is empty.
    """
    st = datetime.now()
    expected = cur.count()
    rows = []
    # Pre-bind the range markers: with an empty cursor the loop body never
    # runs, and reading them afterwards would otherwise raise.
    start = end = None
    for position, doc in enumerate(cur, 1):
        if position == 1:
            start = doc['created']
        result = text_process(doc, geo=geo, filter_sw=fsw, stem=stem)
        rows.append({
            'words': result[0].split(),
            'created_at': doc['created'],
            'geo': result[1],
        })
        progress(position, expected, skip=100)
        end = doc['created']
    # One-argument print(...) produces identical output on Python 2 and 3.
    print('%s %s' % ('\nretrieval and processing took', datetime.now() - st))
    return rows, start, end
Example #3
0
# Append the processed text of every document found under p.client['tweets']
# (all collections except SPB, EKB and Moscow) to the corpus file, one entry
# per line.
p = MDB('tweets')
cols = p.client['tweets'].collection_names()
# list.remove raises ValueError when a name is not present in the listing.
for skipped in ('SPB', 'EKB', 'Moscow'):
    cols.remove(skipped)
print(cols)
i = 0

# Count everything up front so progress() can be given the overall total.
counts = [p.client['tweets'][c].find().count() for c in cols]
total = sum(counts)
print('%s %s %s' % ('total:', total, 'documents'))

# The with-block guarantees the file is flushed and closed even if the
# iteration is interrupted part-way through.
with open('assets/tw_ht_corpus_2.txt', 'a') as f:
    for c in cols:
        ml = p.client['tweets'][c].find()
        for t in ml:
            try:
                dt = text_process(t)[0]
                progress(i, total)
                if dt:
                    f.write(dt + '\n')
            except Exception as e:
                # Best effort: report the failure and move on to the next
                # document rather than aborting the whole export.
                print(e)
            finally:
                i += 1
Example #4
0
# Export script: walks every collection under p.client['tweets'] except SPB,
# EKB and Moscow, and appends each document's processed text to the corpus
# file, one line per document.
p = MDB('tweets')
cols = p.client['tweets'].collection_names()
# Each remove() raises ValueError if the named collection is not listed.
for unwanted in ('SPB', 'EKB', 'Moscow'):
    cols.remove(unwanted)
print(cols)
i = 0

# Per-collection document counts; their sum is the total handed to progress().
counts = [p.client['tweets'][c].find().count() for c in cols]
total = sum(counts)
print('%s %s %s' % ('total:', total, 'documents'))

# Open the output inside a context manager so the handle is always closed,
# including when an unexpected error escapes the loop.
with open('assets/tw_ht_corpus_2.txt', 'a') as f:
    for c in cols:
        ml = p.client['tweets'][c].find()
        for t in ml:
            try:
                dt = text_process(t)[0]
                progress(i, total)
                # Empty/falsy results are skipped so no blank lines are written.
                if dt:
                    f.write(dt + '\n')
            except Exception as e:
                # Deliberately tolerant: print the error and keep going.
                print(e)
            finally:
                # Advance the counter whether or not the document succeeded.
                i += 1