Esempio n. 1
0
    # Open only the outputs that were requested: each handle below is created
    # solely when its filename argument is not None.
    if (out_date_fn!=None):
        out_date_file = open(out_date_fn, 'w')
    # The message and count outputs are encoded as ASCII; characters that
    # cannot be encoded are dropped rather than raising (errors='ignore').
    if (out_msg_fn!=None):
        out_file = codecs.open(out_msg_fn, 'w', encoding='ascii', errors='ignore')
    if (out_cnt_fn!=None):
        out_cnt_file = codecs.open(out_cnt_fn, 'w', encoding='ascii', errors='ignore')

    # Create UTF8 -> ascii hash
    rewrite_hash = create_utf8_rewrite_hash()

    # Loop over files in input list
    for input_file in list_file:
        # Drop trailing whitespace (including the newline) from the list entry
        input_file = input_file.rstrip()

        print 'Reading in file: {}'.format(input_file)
        transactions = tt.load_tweets(input_file)
        print 'Done'

        for key in transactions.keys():
            value = transactions[key]
            # Optional filter: when key_search is set, keep only records whose
            # ky1 field equals ky2; records without a ky1 field are skipped too.
            # NOTE(review): ky1/ky2 are presumably derived from key_search
            # earlier in this function -- confirm.
            if (key_search!=None):
                if (value.has_key(ky1)):
                    if (value[ky1] != ky2):
                        continue
                else:
                    continue
            if (out_date_fn!=None):
                # Parse the record's 'date' string with fmt_in. fmt_out is a
                # compact YYYYMMDD layout, presumably applied when the date is
                # written out past the end of this excerpt -- confirm.
                dt = value['date']
                fmt_in = '%a %b %d %H:%M:%S %Z %Y'
                fmt_out = '%Y%m%d'
                dt_python = datetime.strptime(dt, fmt_in)
Esempio n. 2
0
    # Destination of the fully processed, serialized tweets for this input.
    outfile = os.path.join(destdir, out_fn)
    print 'outfile name is : {}'.format(outfile)

    # Results are cached on disk: an existing outfile means this input is
    # skipped (the `continue` targets an enclosing loop outside this excerpt).
    if (os.path.exists(outfile)):
        print "Outfile already exists, skipping ..., delete to regenerate: {}".format(
            outfile)
        continue
    # Run the converter script, writing its result to a temporary file.
    # NOTE(review): the command line is assembled by string formatting and run
    # through the shell, and the exit status of os.system() is discarded, so a
    # failed conversion is not detected here.
    tmpfile = os.path.join(tmpdir, out_fn)
    cmd = "scripts/tweet_to_dict.py --in {} --out {} --verbose {} {}".format(
        fn, tmpfile, debug, additional_args)
    print "Running command: {}".format(cmd)
    os.system(cmd)

    # Now add simple metadata -- at tags, hashtags, links
    print "Adding simple metadata ..."
    transactions = tt.load_tweets(tmpfile)
    # The temporary file is deleted as soon as its contents are loaded.
    os.unlink(tmpfile)
    tsm.add_simple_metadata(transactions, debug)

    # Normalize
    print "Performing normalization ..."
    tnm.normalize_msgs(transactions, debug)

    # Language id on text
    # print "Performing language recognition ..."
    # tlid.add_lid(transactions, debug)

    # Save it
    # The trailing comma suppresses the newline (Python 2 print statement), so
    # the next print continues on the same output line.
    print "Saving to serialized file ... ",
    print "outfile: {}".format(outfile)
    tt.save_tweets(transactions, outfile)
Esempio n. 3
0
    # Command-line driver: load a pickled file of tweets, run
    # add_simple_metadata() over it, and pickle the result to the output file.

    # Parse input command line options
    parser = OptionParser()
    parser.add_option("--input_file",
                      help="input pickled file of tweets",
                      metavar="FILE")
    parser.add_option("--output_file",
                      help="output pickled file of tweets",
                      metavar="FILE")
    # --verbose takes a number, not a path, so show it as LEVEL in --help.
    parser.add_option("--verbose",
                      help="verbosity > 0 -> debug mode",
                      metavar="LEVEL",
                      default=0)
    (Options, args) = parser.parse_args()
    input_file = Options.input_file
    output_file = Options.output_file
    debug = int(Options.verbose)
    if input_file is None or output_file is None:
        print("Need to specify input and output files -- run with --help for syntax")
        exit(1)

    print('Reading in file: {}'.format(input_file))
    # Load from the --input_file path; no name `infile` is defined in this
    # driver, so referring to it would raise NameError.
    transactions = tt.load_tweets(input_file)
    print('Done')

    add_simple_metadata(transactions, debug)

    # Pickle output is binary data: open in 'wb', and let the context manager
    # close the file even if dump() raises.
    with open(output_file, 'wb') as outfile:
        pickle.dump(transactions, outfile)
Esempio n. 4
0
# Do not rebuild: if the pickled graph file is already present, stop here.
if (os.path.exists(outfile_pckl)):
    print 'Graph {} already exists, exiting ...'.format(outfile_pckl)
    exit(0)

# listfn is read line by line; each line is treated as the path of a tweet
# file to load. A single directed graph accumulates nodes across all files.
# NOTE(review): listfile is not closed anywhere in this excerpt.
listfile = open(listfn, 'r')
G = nx.DiGraph()
for fn in listfile:

    fn = fn.rstrip()

    print "Loading file: {}".format(fn)
    # Flush so the progress message appears before the load below starts.
    sys.stdout.flush()

    # Load in tweets
    # NOTE(review): this second rstrip() is redundant -- fn was already
    # stripped at the top of the loop.
    fn = fn.rstrip()
    xact = tt.load_tweets(fn)

    # Add to graph
    for ky in xact.keys():
        val = xact[ky]
        
        # user node
        # The node name is the literal prefix below plus the lower-cased
        # userid; the node is created and tagged type 'user' only the first
        # time it is seen.
        # NOTE(review): if nx is networkx, G.node is the legacy attribute
        # accessor (newer releases use G.nodes) -- confirm the pinned version.
        user = '******' + val['userid'].lower()
        if (not G.has_node(user)):
            G.add_node(user)
            G.node[user]['type'] = 'user'

        # Mention list
        # A deep copy is taken, presumably so that later edits to the list do
        # not alter the loaded record -- the code using it lies past the end of
        # this excerpt.
        if (val.has_key('mentions')):
            mention_list = copy.deepcopy(val['mentions'])
        else:
Esempio n. 5
0
            print "predicted language lui: {}".format(lang)
            print


# Main driver: command line interface
if __name__ == '__main__':
    # Command-line driver: load a pickled file of tweets, run add_lid() over
    # it, and pickle the result to the output file.

    # Parse input command line options
    parser = OptionParser()
    parser.add_option("--input_file", help="input pickled file of tweets", metavar="FILE")
    parser.add_option("--output_file", help="output pickled file of tweets", metavar="FILE")
    # --verbose takes a number, not a path, so show it as LEVEL in --help.
    parser.add_option("--verbose", help="verbosity > 0 -> debug mode", metavar="LEVEL", default=0)
    (Options, args) = parser.parse_args()
    input_file = Options.input_file
    output_file = Options.output_file
    debug = int(Options.verbose)
    if input_file is None or output_file is None:
        print("Need to specify input and output files -- run with --help for syntax")
        exit(1)

    print('Reading in file: {}'.format(input_file))
    # Load from the --input_file path; no name `infile` is defined in this
    # driver, so referring to it would raise NameError.
    transactions = tt.load_tweets(input_file)
    print('Done')

    add_lid(transactions, debug)

    # Pickle output is binary data: open in 'wb', and let the context manager
    # close the file even if dump() raises.
    with open(output_file, 'wb') as outfile:
        pickle.dump(transactions, outfile)