def do_dependency_parse(fil): u = TwitterUser() u.populate_tweets_from_file(fil,do_tokenize=False) out_file_name = fil.replace(".json","").replace(".gz","").replace("/json/","/dep_parse/") print out_file_name if len(u.tweets) == 0: os.utime(out_file_name) return 'empty, success' data = dependency_parse_tweets(TWEEBOPARSER_LOC,u.tweets,out_file_name) return 'completed'
def do_dependency_parse(fil): u = TwitterUser() u.populate_tweets_from_file(fil, do_tokenize=False) out_file_name = fil.replace(".json", "").replace(".gz", "").replace( "/json/", "/dep_parse/") print out_file_name if len(u.tweets) == 0: os.utime(out_file_name) return 'empty, success' data = dependency_parse_tweets(TWEEBOPARSER_LOC, u.tweets, out_file_name) return 'completed'
def gen_dp(data): of_id, uid_list = data json_of_name = os.path.join(JSON_OUTPUT_DIRECTORY, str(of_id) + ".json.gz") dp_of_name = os.path.join(DP_OUTPUT_DIRECTORY, str(of_id) + ".dp") reader = [ z.decode("utf8") for z in gzip.open(json_of_name).read().splitlines() ] tweets_to_write = [Tweet(json.loads(l), do_tokenize=False) for l in reader] if not os.path.exists(dp_of_name + ".gz"): print 'DOING DP', dp_of_name try: dp = dependency_parse_tweets(TWEEBOPARSER_LOC, tweets_to_write, dp_of_name) except: print 'FAILED DP STUFF: ', dp_of_name
def gen_dp(json_input_filename, dp_output_filename): """ This function generates a dependency parse file (ending in dp) that will be used to create features for the identity extractor model. This process takes by far the longest of any process in this file. It calls out to a shell script that runs tweeboparser. :param json_input_filename: A (possibly cleaned, possibly gzipped) JSON file :param dp_output_filename: An output filename for the dependency parse :return: """ reader = [z.decode("utf8") for z in gzip.open(json_input_filename).read().splitlines()] tweets_to_write = [Tweet(json.loads(l),do_tokenize=False) for l in reader] if not os.path.exists(dp_output_filename+".gz"): print 'DOING DP', dp_output_filename try: dp = dependency_parse_tweets(TWEEBOPARSER_LOC,tweets_to_write, dp_output_filename) except: print 'FAILED DP STUFF: ', dp_output_filename
def gen_dp(json_input_filename, dp_output_filename): """ This function generates a dependency parse file (ending in dp) that will be used to create features for the identity extractor model. This process takes by far the longest of any process in this file. It calls out to a shell script that runs tweeboparser. :param json_input_filename: A (possibly cleaned, possibly gzipped) JSON file :param dp_output_filename: An output filename for the dependency parse :return: """ reader = [ z.decode("utf8") for z in gzip.open(json_input_filename).read().splitlines() ] tweets_to_write = [Tweet(json.loads(l), do_tokenize=False) for l in reader] if not os.path.exists(dp_output_filename + ".gz"): print 'DOING DP', dp_output_filename try: dp = dependency_parse_tweets(TWEEBOPARSER_LOC, tweets_to_write, dp_output_filename) except: print 'FAILED DP STUFF: ', dp_output_filename
__author__ = 'kjoseph'

from utility_code.util import *
from twitter_dm import dependency_parse_tweets
import codecs

# Load the original annotated tweets and parse all of them in a single
# tweeboparser run.
tweet_id_to_tweet = get_original_tweet_data()
all_tweets = [v.tweet for v in tweet_id_to_tweet.values()]

# NOTE(review): the [:-1] drops the last element of the parser output —
# presumably a trailing empty entry; confirm against dependency_parse_tweets.
parse_data = dependency_parse_tweets(
    TWEEBOPARSER_LOCATION,
    all_tweets,
    'processed_data/dependency_parsed_tweets.txt',
    gzip_final_output=False)[:-1]

# Interleave each tweet id with its parse (blank-line separated) so
# downstream code can join parses back to tweets.
# BUG FIX: use `with` so the handle is closed even if a write raises;
# the original leaked the open file on any exception.
with codecs.open("dep_parse_w_ids.txt", "w",
                 "utf8") as write_dep_parse_with_tweet_ids_file:
    for i, parse in enumerate(parse_data):
        write_dep_parse_with_tweet_ids_file.write(str(all_tweets[i].id) + "\n")
        write_dep_parse_with_tweet_ids_file.write(parse)
        write_dep_parse_with_tweet_ids_file.write("\n\n")
__author__ = 'kjoseph'

from utility_code.util import *
from twitter_dm import dependency_parse_tweets
import codecs

# Fetch the annotated tweet set and run the dependency parser over all of
# it at once.
tweet_id_to_tweet = get_original_tweet_data()
all_tweets = [v.tweet for v in tweet_id_to_tweet.values()]

# NOTE(review): [:-1] discards the final element of the parser's output —
# looks like a trailing empty entry; verify against dependency_parse_tweets.
parse_data = dependency_parse_tweets(
    TWEEBOPARSER_LOCATION,
    all_tweets,
    'processed_data/dependency_parsed_tweets.txt',
    gzip_final_output=False)[:-1]

# Write "<tweet id>\n<parse>\n\n" records so parses can be re-joined to
# tweets downstream.
# BUG FIX: context manager guarantees the file is closed on error; the
# original only closed it on the happy path.
with codecs.open("dep_parse_w_ids.txt", "w",
                 "utf8") as write_dep_parse_with_tweet_ids_file:
    for i, parse in enumerate(parse_data):
        write_dep_parse_with_tweet_ids_file.write(str(all_tweets[i].id) + "\n")
        write_dep_parse_with_tweet_ids_file.write(parse)
        write_dep_parse_with_tweet_ids_file.write("\n\n")