Esempio n. 1
0
    def dump_to_sqlite_db(self, output_db, input_fields, top_level=False):
        def replace_none(s):
            if s is None:
                return 'NULL'
            return s

        tweet_parser = TweetParser()
        column_str = ','.join([column for column in input_fields]).replace('.','__')
        question_marks = ','.join(['?' for column in input_fields])

        con = sqlite3.connect(output_db)
        cur = con.cursor()
        cur.execute("CREATE TABLE IF NOT EXISTS data ({});".format(column_str))

        insert_list = []
        # batch insert if more than 10k tweets
        for count, tweet in enumerate(self.get_iterator()):
            if top_level:
                ret = list(zip(input_fields, [tweet.get(field) for field in input_fields]))
            else:
                ret = tweet_parser.parse_columns_from_tweet(tweet, input_fields)
            row = [replace_none(col_val[1]) for col_val in ret]
            insert_list.append(tuple(row))
            if (count % 10000) == 0:
                cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list)
                con.commit()
                insert_list = []
        if count < 10000:
            cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list)
            con.commit()
        con.close()
Esempio n. 2
0
def make_sqlite_db_json(output, input_file, fields):
    logger = logging.getLogger(__name__)
    logger.info('Creating your output file : %s', output)

    column_str = ','.join([column for column in fields]).replace('.','__')
    question_marks = ','.join(['?' for column in fields])
    con = sqlite3.connect(output)
    cur = con.cursor()
    cur.execute("CREATE TABLE IF NOT EXISTS data ({});".format(column_str))

    json_col = JsonCollection(input_file)
    insert_list = []
    tp = TweetParser()

    for count,tweet in enumerate(json_col.get_iterator()):
        ret = tp.parse_columns_from_tweet(tweet, fields)
        row = [replace_none(col_val[1]) for col_val in ret]
        insert_list.append(tuple(row))

        if (count % 10000) == 0:
            cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list)
            con.commit()
            insert_list = []

    if count < 10000:
        cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list)
        con.commit()

    con.close()
    logger.info('Finished processing input: {}, output is: {}'.format(input_file, output))
Esempio n. 3
0
    def dump_to_csv(self, output_csv, input_fields, write_header=True, top_level=False, mode='a', encoding='utf-8', compression=None):
        if compression == 'bz2':
            mode = binary_mode(mode)
            filehandle = bz2.open(output_csv, mode)
        elif compression == 'gzip':
            mode = binary_mode(mode)
            filehandle = gzip.open(output_csv, mode)
        else:
            filehandle = open(output_csv, mode)
            
        writer = csv.writer(filehandle)
        if write_header:
            writer.writerow(input_fields)
        tweet_parser = TweetParser()

        for tweet in self.get_iterator():
            if top_level:
                ret = list(zip(input_fields, [tweet.get(field) for field in input_fields]))
            else:
                ret = tweet_parser.parse_columns_from_tweet(tweet,input_fields)
            ret_values = [col_val[1] for col_val in ret]
            writer.writerow(ret_values)
        filehandle.close()