if (out_date_fn != None):
    out_date_file = open(out_date_fn, 'w')
if (out_msg_fn != None):
    out_file = codecs.open(out_msg_fn, 'w', encoding='ascii', errors='ignore')
if (out_cnt_fn != None):
    out_cnt_file = codecs.open(out_cnt_fn, 'w', encoding='ascii', errors='ignore')

# Create UTF8 -> ascii hash
rewrite_hash = create_utf8_rewrite_hash()

# Loop over files in input list
for input_file in list_file:
    input_file = input_file.rstrip()
    print 'Reading in file: {}'.format(input_file)
    transactions = tt.load_tweets(input_file)
    print 'Done'

    for key in transactions.keys():
        value = transactions[key]

        # Optional field filter: skip tweets whose field ky1 is missing or
        # does not equal ky2 (ky1/ky2 presumably parsed from key_search upstream)
        if (key_search != None):
            if (value.has_key(ky1)):
                if (value[ky1] != ky2):
                    continue
            else:
                continue

        # Convert the tweet timestamp to YYYYMMDD
        if (out_date_fn != None):
            dt = value['date']
            fmt_in = '%a %b %d %H:%M:%S %Z %Y'
            fmt_out = '%Y%m%d'
            dt_python = datetime.strptime(dt, fmt_in)
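# --- Illustrative sketch (not part of the original script) ---
# The date round-trip above can be exercised in isolation; the sample
# timestamp below is hypothetical, not taken from the data.
from datetime import datetime

dt = 'Fri Mar 29 11:03:41 UTC 2013'    # hypothetical timestamp
dt_python = datetime.strptime(dt, '%a %b %d %H:%M:%S %Z %Y')
print dt_python.strftime('%Y%m%d')     # -> 20130329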
outfile = os.path.join(destdir, out_fn)
print 'outfile name is : {}'.format(outfile)
if (os.path.exists(outfile)):
    print "Outfile already exists, skipping ..., delete to regenerate: {}".format(outfile)
    continue
tmpfile = os.path.join(tmpdir, out_fn)
cmd = "scripts/tweet_to_dict.py --in {} --out {} --verbose {} {}".format(fn, tmpfile, debug, additional_args)
print "Running command: {}".format(cmd)
os.system(cmd)

# Now add simple metadata -- at tags, hashtags, links
print "Adding simple metadata ..."
transactions = tt.load_tweets(tmpfile)
os.unlink(tmpfile)
tsm.add_simple_metadata(transactions, debug)

# Normalize
print "Performing normalization ..."
tnm.normalize_msgs(transactions, debug)

# Language id on text
# print "Performing language recognition ..."
# tlid.add_lid(transactions, debug)

# Save it
print "Saving to serialized file ... ",
print "outfile: {}".format(outfile)
tt.save_tweets(transactions, outfile)
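# --- Illustrative sketch (not part of the original script) ---
# os.system() passes cmd through the shell, so a filename containing spaces
# or shell metacharacters would break the call. If the pipeline were
# reworked, the same step could use the standard subprocess module; a
# sketch under that assumption (run_tweet_to_dict is a hypothetical name):
import subprocess

def run_tweet_to_dict(fn, tmpfile, debug, additional_args):
    # Build the argument list explicitly so quoting is never an issue
    cmd = ['scripts/tweet_to_dict.py', '--in', fn, '--out', tmpfile,
           '--verbose', str(debug)] + additional_args.split()
    ret = subprocess.call(cmd)
    if (ret != 0):
        raise RuntimeError('tweet_to_dict.py failed with status {}'.format(ret))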
# Parse input command line options
parser = OptionParser()
parser.add_option("--input_file", help="input pickled file of tweets", metavar="FILE")
parser.add_option("--output_file", help="output pickled file of tweets", metavar="FILE")
parser.add_option("--verbose", help="verbosity > 0 -> debug mode", metavar="FILE", default=0)
(Options, args) = parser.parse_args()

input_file = Options.input_file
output_file = Options.output_file
debug = int(Options.verbose)

if (input_file == None or output_file == None):
    print "Need to specify input and output files -- run with --help for syntax"
    exit(1)

print 'Reading in file: {}'.format(input_file)
transactions = tt.load_tweets(input_file)
print 'Done'

add_simple_metadata(transactions, debug)

# Serialize the augmented transactions to the output pickle
outfile = open(output_file, 'w')
pickle.dump(transactions, outfile)
outfile.close()
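# --- Illustrative sketch (not part of the original script) ---
# Typical invocation and a round-trip check of the pickled output; the
# script name and file names here are hypothetical:
#
#     python add_metadata.py --input_file raw_tweets.pckl \
#         --output_file tweets_meta.pckl --verbose 1
#
import pickle

with open('tweets_meta.pckl', 'rb') as f:    # hypothetical filename
    transactions = pickle.load(f)
print 'Loaded {} tweets'.format(len(transactions))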
if (os.path.exists(outfile_pckl)):
    print 'Graph {} already exists, exiting ...'.format(outfile_pckl)
    exit(0)

listfile = open(listfn, 'r')
G = nx.DiGraph()
for fn in listfile:
    fn = fn.rstrip()
    print "Loading file: {}".format(fn)
    sys.stdout.flush()

    # Load in tweets
    xact = tt.load_tweets(fn)

    # Add to graph
    for ky in xact.keys():
        val = xact[ky]

        # user node
        user = '******' + val['userid'].lower()
        if (not G.has_node(user)):
            G.add_node(user)
            G.node[user]['type'] = 'user'

        # Mention list
        if (val.has_key('mentions')):
            mention_list = copy.deepcopy(val['mentions'])
        else:
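# --- Illustrative sketch (not part of the original script) ---
# The fragment above truncates before the mention edges are added; a
# self-contained sketch of the likely shape of that graph, in the same
# networkx 1.x node-attribute style (the users and the author->mention
# edge direction are illustrative assumptions):
import networkx as nx

G = nx.DiGraph()
for author, mentioned in [('alice', 'bob'), ('alice', 'carol')]:
    for node in (author, mentioned):
        if (not G.has_node(node)):
            G.add_node(node)
            G.node[node]['type'] = 'user'
    G.add_edge(author, mentioned)    # author mentions the other account
print '{} nodes, {} edges'.format(G.number_of_nodes(), G.number_of_edges())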
print "predicted language lui: {}".format(lang) print # Main driver: command line interface if __name__ == '__main__': # Parse input command line options parser = OptionParser() parser.add_option("--input_file", help="input pickled file of tweets", metavar="FILE") parser.add_option("--output_file", help="output pickled file of tweets", metavar="FILE") parser.add_option("--verbose", help="verbosity > 0 -> debug mode", metavar="FILE", default=0) (Options, args) = parser.parse_args() input_file = Options.input_file output_file = Options.output_file debug = int(Options.verbose) if (input_file==None or output_file==None): print "Need to specify input and output files -- run with --help for syntax" exit(1) print 'Reading in file: {}'.format(input_file) transactions = tt.load_tweets(infile) print 'Done' add_lid(transactions, debug) outfile = open(output_file, 'w') pickle.dump(transactions, outfile) outfile.close()