def dedup_tags():
    """Remove duplicate rows from the tag table.

    Two tag rows are duplicates when they agree on status_id, tag,
    category and regex; only the row with the highest id survives.
    """
    dedup_sql = '''DELETE FROM tag USING tag t WHERE tag.status_id = t.status_id AND tag.tag = t.tag AND tag.category = t.category AND tag.regex = t.regex AND tag.id < t.id'''
    engine.query(dedup_sql)
def dump_hashtag(tag): data = [] status_tbl = engine['status'].table user_tbl = engine['user'].table q = status_tbl.join(user_tbl, user_tbl.c.id == status_tbl.c.user_id) q = q.join(hashtags_tbl, status_tbl.c.id == hashtags_tbl.c.status_id) q = sql.select([status_tbl, user_tbl], from_obj=q, use_labels=True) q = q.where(hashtags_tbl.c.text.ilike(tag)) q = q.order_by(hashtags_tbl.c.status_id.asc()) statuses = [] for row in engine.query(q): data.append(row) #data.append(json.loads(row['raw_json'])) #for json_file in os.listdir('dumps'): # print json_file, len(statuses), len(data) # #min_id = int(json_file.split('.', 1)[0].split('_', 1)[-1]) # fh = open('dumps/%s' % json_file, 'rb') # ss = json.load(fh) # for s in ss: # if s.get('id') in statuses: # data.append(s) log.info("Saving file...") fh = open('dump_%s.json' % tag, 'wb') print len(data) json.dump(data, fh, cls=JSONEncoder) fh.close() return True
def classify_tweets(rules):
    # Classify recent German-language tweets against the supplied rules,
    # paging through statuses PAGE_SIZE at a time.
    #
    # `rules` appears to map a key to a dict carrying a 'regex' entry
    # (inferred from the comprehension below) — confirm against the
    # rule-loading helper. Each row is handed to handle_status(); after
    # every full page the newest status id seen is upserted into
    # tag_offset per regex so the next run can resume from there.
    #
    # NOTE(review): functionally identical to the later
    # classify_tweets(rules) definition in this module; only the last
    # definition is bound at import time.
    regexen = [d.get('regex') for (a, d) in rules.items()]
    offsets = get_offsets(regexen)
    delete_old_tags(regexen)
    status_tbl = engine['status'].table
    user_tbl = engine['user'].table
    max_id = 0
    q = status_tbl.join(user_tbl, user_tbl.c.id == status_tbl.c.user_id)
    fields = [status_tbl.c.id, status_tbl.c.text, user_tbl.c.id,
              user_tbl.c.name, user_tbl.c.screen_name]
    q = sql.select(fields, from_obj=q, use_labels=True)
    # Restrict to the last 28 days and to statuses at or beyond the
    # smallest stored per-regex offset.
    dt = datetime.utcnow() - timedelta(days=28)
    q = q.where(sql.and_(status_tbl.c.lang == 'de',
                         status_tbl.c.id >= min(offsets.values()),
                         status_tbl.c.created_at > dt))
    q = q.order_by(status_tbl.c.id.asc())
    offset = 0
    while True:
        engine.begin()
        lq = q.limit(PAGE_SIZE).offset(offset)
        offset += PAGE_SIZE
        print offset, PAGE_SIZE
        has_records = False
        for i, status in enumerate(engine.query(lq)):
            has_records = True
            max_id = max(max_id, status.get('status_id'))
            handle_status(status, rules, offsets)
        if not has_records:
            # NOTE(review): breaking here leaves the transaction opened
            # by engine.begin() above without a matching commit — verify
            # the engine rolls it back / tolerates this.
            break
        for regex in regexen:
            offset_table.upsert({'regex': regex, 'status_id': max_id},
                                ['regex'])
        engine.commit()
    dedup_tags()
def classify_tweets():
    # Earlier, non-paginated revision of the classifier: loads rules via
    # get_rules(), scans German statuses newest-first inside a single
    # transaction, and inserts a tag row for every rule whose regex
    # matches the configured status/user field.
    #
    # NOTE(review): shadowed by the later classify_tweets definitions in
    # this module — apparently kept for reference only.
    rules, regexen = get_rules()
    offsets = get_offsets(regexen)
    delete_old_tags(regexen)
    status_tbl = engine['status'].table
    user_tbl = engine['user'].table
    engine.begin()
    max_id = 0
    q = status_tbl.join(user_tbl, user_tbl.c.id == status_tbl.c.user_id)
    q = sql.select([status_tbl, user_tbl], from_obj=q, use_labels=True)
    q = q.where(sql.and_(status_tbl.c.lang == 'de',
                         status_tbl.c.id >= min(offsets.values())))
    q = q.order_by(status_tbl.c.id.desc())
    for i, status in enumerate(engine.query(q)):
        max_id = max(max_id, status.get('status_id'))
        for (field, rule), data in rules.items():
            # Skip statuses this regex has already been run against.
            if offsets.get(data.get('regex')) > status.get('status_id'):
                continue
            m = rule.search(unicode(status.get(field)).lower())
            #print [field,data.get('regex'), m]
            if m is not None:
                #print [field, data.get('regex'), m]
                data['status_id'] = status['status_id']
                tag_table.insert(data)
        if i % 1000 == 0:
            print 'Processed: ', i
    # Record the newest status id seen for each regex so the next run
    # resumes from there.
    for regex in regexen:
        offset_table.upsert({'regex': regex, 'status_id': max_id}, ['regex'])
    engine.commit()
    dedup_tags()
def classify_tweets(rules): regexen = [d.get('regex') for (a, d) in rules.items()] offsets = get_offsets(regexen) delete_old_tags(rules) q = text(""" INSERT INTO tag (category, tag, status_id, classified_at, regex) SELECT :category, :tag, s.id, NOW(), :regex FROM status s LEFT JOIN tag_offset tgo ON tgo.regex = :regex LEFT JOIN "user" u ON s.user_id = u.id WHERE (s.id > tgo.status_id OR tgo.status_id IS NULL) AND (s.text ~* :regex OR u.name ~* :regex OR u.screen_name ~* :regex) AND s.lang = 'de' AND s.created_at > NOW() - INTERVAL '28 days' """) offsets_q = text(""" INSERT INTO tag_offset (regex, status_id) SELECT :regex, t.status_id FROM tag t WHERE t.regex = :regex ORDER BY t.status_id DESC LIMIT 1 """) for rule in rules.values(): print rule engine.begin() engine.query(q, **rule) offset_table.delete(regex=rule['regex']) engine.query(offsets_q, regex=rule['regex']) engine.commit() dedup_tags()
def classify_tweets(): rules = get_rules() status_tbl = engine['status'].table user_tbl = engine['user'].table #engine.begin() q = status_tbl.join(user_tbl, user_tbl.c.id == status_tbl.c.user_id) q = sql.select([status_tbl, user_tbl], from_obj=q, use_labels=True) q = q.where(user_tbl.c.lang == 'de') q = q.order_by(status_tbl.c.id.desc()) for i, status in enumerate(engine.query(q)): for (field, rule), (category, tag) in rules.items(): m = rule.search(unicode(status.get(field)).lower()) if m is not None: tag_status(status, category, tag) if i % 1000 == 0: print 'Processed: ', i #engine.commit() dedup_tags()
def classify_tweets(rules):
    # Classify recent German-language tweets against the supplied rules,
    # paging through statuses PAGE_SIZE at a time. This is the last
    # definition of classify_tweets in the module and therefore the one
    # actually bound at import time; it duplicates the earlier paginated
    # revision above.
    #
    # `rules` appears to map a key to a dict carrying a 'regex' entry
    # (inferred from the comprehension below) — confirm against the
    # rule-loading helper. Each row goes to handle_status(); after every
    # full page the newest status id seen is upserted into tag_offset per
    # regex so the next run can resume from there.
    regexen = [d.get('regex') for (a, d) in rules.items()]
    offsets = get_offsets(regexen)
    delete_old_tags(regexen)
    status_tbl = engine['status'].table
    user_tbl = engine['user'].table
    max_id = 0
    q = status_tbl.join(user_tbl, user_tbl.c.id == status_tbl.c.user_id)
    fields = [
        status_tbl.c.id, status_tbl.c.text, user_tbl.c.id, user_tbl.c.name,
        user_tbl.c.screen_name
    ]
    q = sql.select(fields, from_obj=q, use_labels=True)
    # Restrict to the last 28 days and to statuses at or beyond the
    # smallest stored per-regex offset.
    dt = datetime.utcnow() - timedelta(days=28)
    q = q.where(
        sql.and_(status_tbl.c.lang == 'de',
                 status_tbl.c.id >= min(offsets.values()),
                 status_tbl.c.created_at > dt))
    q = q.order_by(status_tbl.c.id.asc())
    offset = 0
    while True:
        engine.begin()
        lq = q.limit(PAGE_SIZE).offset(offset)
        offset += PAGE_SIZE
        print offset, PAGE_SIZE
        has_records = False
        for i, status in enumerate(engine.query(lq)):
            has_records = True
            max_id = max(max_id, status.get('status_id'))
            handle_status(status, rules, offsets)
        if not has_records:
            # NOTE(review): breaking here leaves the transaction opened
            # by engine.begin() above without a matching commit — verify
            # the engine rolls it back / tolerates this.
            break
        for regex in regexen:
            offset_table.upsert({
                'regex': regex,
                'status_id': max_id
            }, ['regex'])
        engine.commit()
    dedup_tags()
def geocode_locations():
    """Geocode every distinct, normalised user location that is not yet
    present in the locations table."""
    pending_sql = """SELECT DISTINCT TRIM(LOWER(u.location)) AS loc FROM "user" u LEFT OUTER JOIN locations lx ON lx.location = TRIM(LOWER(u.location)) WHERE u.location IS NOT NULL AND lx.location IS NULL;"""
    # Materialise the result set before geocoding — geocode_location
    # presumably inserts into the locations table being joined against,
    # which would disturb an open cursor (TODO confirm).
    pending = list(engine.query(pending_sql))
    for row in pending:
        geocode_location(row)