def process_lines(dump: io.TextIOWrapper,
                  stats: Mapping,
                  args: argparse.Namespace,
                  shared: dict,
                  m3twitter: M3Twitter) -> Iterator[dict]:
    """Check, for each line (user), whether the number of tweets reaches the
    configured minimum; if so, use transform_jsonl_object to build the dict
    that inference will consume later on.
    """
    for user in dump:
        stats['performance']['input']['users'] += 1

        # transform the object only if the user reached the minimum number of tweets
        if 'tweets' in user and int(user['tweets']) >= args.min_tweets:
            stats['performance']['input']['to_infer'] += 1
            shared[user['id_str']] = init_user(user)

            # handle empty profile_image_url_https
            if user['profile_image_url_https'] == "":
                user['default_profile_image'] = True
                stats['performance']['input']['img_errors'] += 1

            yield m3twitter.transform_jsonl_object(user)
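# A minimal, hypothetical driver for process_lines, shown only to illustrate
# the calling convention. The dump annotation above says io.TextIOWrapper,
# but the subscripting implies a stream of already-parsed user dicts, so this
# sketch feeds it a generator of dicts. The file name, min_tweets value, and
# stats skeleton are illustrative assumptions; init_user comes from the
# surrounding module.
import argparse
import json

args = argparse.Namespace(min_tweets=10)
stats = {'performance': {'input': {'users': 0, 'to_infer': 0, 'img_errors': 0}}}
shared = {}
m3twitter = M3Twitter(cache_dir='twitter_cache')

users = (json.loads(line) for line in open('users.jsonl', encoding='utf-8'))
for obj in process_lines(users, stats=stats, args=args,
                         shared=shared, m3twitter=m3twitter):
    print(obj['img_path'])  # img_path is set by transform_jsonl_object (see main below)
print(stats['performance']['input'])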
def build_m3(username):
    # Gather info
    m3twitter = M3Twitter()
    res = m3twitter.infer_screen_name(username)

    # Separate profile elements
    prfs = []
    for attr in res['output']:
        keys = list(res['output'][attr].keys())
        vals = list(res['output'][attr].values())

        if attr == 'gender':
            category_desc = 'more masculine\nmore feminine'
        elif attr == 'age':
            category_desc = 'eighteen or younger\ntwenties\nthirties\nover forty'
        elif attr == 'org':
            attr = 'organization'
            category_desc = 'less like an organization\nmore like an organization'
            keys[0] = 'non-organization'
            keys[1] = 'organization'
        else:
            raise Exception("Unexpected M3 category %s" % attr)

        score = max(vals)
        if score < 0.6:
            conf_str = 'low'
        elif score <= 0.85:
            conf_str = 'moderate'
        else:
            conf_str = 'high'

        p = Profile(username=username,
                    attr_name=attr,
                    classifier='m3',
                    predicted_class=keys[vals.index(score)],
                    class_confidence=score,
                    confidence_str=conf_str,
                    attr_categories='\n'.join(keys),
                    attr_values='\n'.join(str(x) for x in vals),
                    category_desc=category_desc,
                    explanations='',
                    global_explanations='',
                    is_current=True,
                    is_sensitive=False)
        p.save()
        prfs.append(p)

    # Return profiles
    return tuple(prfs)
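# Illustrative usage sketch for build_m3; it assumes Twitter credentials are
# configured for M3Twitter (or the result is already cached) and that the
# Profile model is backed by a database. The screen name is arbitrary.
profiles = build_m3('some_screen_name')
for p in profiles:
    print(p.attr_name, p.predicted_class, p.confidence_str)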
def __init__(self, pic_dir):
    self.m3twitter = M3Twitter(cache_dir=pic_dir)
""" def clean_tweet(self, tweet): return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])| (\w+:\/\/\S+)", " ", tweet).split()) def analyze_sentiment(self, tweet): analysis = TextBlob(self.clean_tweet(tweet)) if analysis.sentiment.polarity > 0: return 1 elif analysis.sentiment.polarity == 0: return 0 else: return -1 if __name__ == '__main__': PATH = "/Users/neilyeung/Desktop/DM_Final_Proj/data/testing/" final_df = pd.read_json(PATH + "2020-04-1_clean-dataset.jsonl", lines=true) # Polarity sent = Sentiment() final_df['sentiment'] = np.array([sent.analyze_sentiment(tweet) for tweet in final_df['full_text']]) # M3Inference m3twitter = M3Twitter(cache_dir="twitter_cache") m3twitter.transform_jsonl(input_file = "test/twitter_cache/2020-04-1_clean-dataset.jsonl",output_file = "test/twitter_cache/m3_input.jsonl") m3twitter.infer("test/twitter_cache/m3_input.jsonl")) m3_df = pd.read_json(PATH + "test/twitter_cache/m3_input.jsonl", lines=true) # TO DO: Test how the data frame looks on
parser.add_argument(
    '--skip-cache',
    dest='skip_cache',
    action='store_true',
    help='By default all requests are cached to the local filesystem and '
         'not refetched. Include this flag to disable/overwrite any results '
         'already in the cache.')
parser.add_argument(
    '--skip_logging',
    action='store_true',
    required=False,
    help='(Optional) Skip logging info if set.')
parser.set_defaults(skip_cache=False)

args = parser.parse_args()

if (args.id is None) == (args.screen_name is None):
    # User specified both id and screen-name, or neither one.
    sys.stderr.write("Exactly ONE of id or screen-name is required\n")
    parser.print_help(sys.stderr)
    sys.exit(1)

m3Twitter = M3Twitter(skip_logging=args.skip_logging)
m3Twitter.twitter_init_from_file(args.auth)

if args.id is not None:
    pprint.pprint(m3Twitter.infer_id(args.id, skip_cache=args.skip_cache))
else:
    pprint.pprint(
        m3Twitter.infer_screen_name(args.screen_name,
                                    skip_cache=args.skip_cache))
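# For reference, the pretty-printed result is a nested dict with the keys the
# other snippets in this file consume; the probabilities shown here are
# illustrative, not real output:
# {'input': {...original profile fields...},
#  'output': {'age': {'19-29': 0.08, '30-39': 0.11, '<=18': 0.03, '>=40': 0.78},
#             'gender': {'female': 0.03, 'male': 0.97},
#             'org': {'is-org': 0.04, 'non-org': 0.96}}}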
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with m3webdemo. If not, see <https://www.gnu.org/licenses/>.

from flask import Flask, url_for, render_template, Response, request
from m3inference import M3Twitter
import json
import glob
import urllib
import os

app = Flask(__name__)

m3twitter = M3Twitter(cache_dir="static/m3/", model_dir="./")
m3twitter.twitter_init(api_key=os.environ["api_key"],
                       api_secret=os.environ["api_secret"],
                       access_token=os.environ["access_token"],
                       access_secret=os.environ["access_secret"])

screen_name_list = [
    x.replace(".json", "").replace("static/m3/", "")
    for x in glob.glob("static/m3/*.json")
]


@app.route('/')
def index():
    return render_template("index.html")
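# A hypothetical JSON endpoint in the same style, not part of the original
# app: the route name is an assumption, while infer_screen_name is the same
# call used by the CLI snippets above.
@app.route('/infer/<screen_name>')
def infer(screen_name):
    result = m3twitter.infer_screen_name(screen_name)
    return Response(json.dumps(result), mimetype='application/json')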
def main(dump: io.TextIOWrapper,
         basename: str,
         args: argparse.Namespace,
         shared: dict) -> None:
    """Preprocess the dump, run M3 inference, and write the results."""
    stats = {
        'performance': {
            'start_preprocess': None,
            'end_preprocess': None,
            'start_infer': None,
            'end_infer': None,
            'input': {
                'users': 0,
                'to_infer': 0,
                'img_errors': 0
            },
        },
    }

    stats['performance']['start_preprocess'] = datetime.datetime.utcnow()

    cache_dir = f"{args.output_dir_path}/twitter_cache_{args.cache_dir}"
    m3twitter = M3Twitter(cache_dir=cache_dir, use_full_model=True)

    # process the dump
    res = process_lines(dump,
                        stats=stats,
                        args=args,
                        shared=shared,
                        m3twitter=m3twitter)

    m3_input_file = f"{cache_dir}/m3_input.jsonl"
    output = fu.output_writer(path=m3_input_file, compression=None, mode='wt')
    for obj in res:
        # handle error while downloading an image
        if not os.path.exists(obj['img_path']):
            obj['img_path'] = TW_DEFAULT_PROFILE_IMG
            stats['performance']['input']['img_errors'] += 1
        output.write(json.dumps(obj))
        output.write("\n")
    output.close()
    stats['performance']['end_preprocess'] = datetime.datetime.utcnow()

    stats['performance']['start_infer'] = datetime.datetime.utcnow()
    inferred_users = m3twitter.infer(m3_input_file)

    stats_output = open(os.devnull, 'wt')
    output = open(os.devnull, 'wt')
    if not args.dry_run:
        # extract useful info from the name
        path_list = re.split(r'-|\.', basename)
        lang = path_list[0]

        stats_path = f"{args.output_dir_path}/infer-users/stats/{lang}"
        Path(stats_path).mkdir(parents=True, exist_ok=True)
        varname = ('{basename}-{pid}.{func}'.format(basename=basename,
                                                    pid=os.getpid(),
                                                    func='infer-users'))
        stats_filename = f"{stats_path}/{varname}.stats.xml"
        stats_output = fu.output_writer(path=stats_filename,
                                        compression=args.output_compression,
                                        mode='wt')

        file_path = f"{args.output_dir_path}/infer-users"
        Path(file_path).mkdir(parents=True, exist_ok=True)
        output_filename = f"{file_path}/{lang}-users-inference-{os.getpid()}.csv"
        output = fu.output_writer(path=output_filename,
                                  compression=args.output_compression,
                                  mode='wt')

    utils.log('Writing the results...')
    writer = csv.DictWriter(output, fieldnames=fieldnames)
    writer.writeheader()
    for user in inferred_users:
        if user in shared:
            user_dict = shared[user]
            inferred_user_stats = inferred_users[user]

            inferred_gender = inferred_user_stats['gender']
            if inferred_gender['female'] >= inferred_gender['male']:
                user_dict['gender'] = 'female'
                user_dict['gender_acc'] = inferred_gender['female']
            else:
                user_dict['gender'] = 'male'
                user_dict['gender_acc'] = inferred_gender['male']

            inferred_age = inferred_user_stats['age']
            for age, accuracy in inferred_age.items():
                user_dict[f'age_{age}_acc'] = accuracy
            # the age classes are mutually exclusive, so P(<40) = 1 - P(>=40)
            if inferred_age['>=40'] >= 1 - inferred_age['>=40']:
                user_dict['age'] = '>=40'
                user_dict['age_acc'] = inferred_age['>=40']
            else:
                user_dict['age'] = '<40'
                user_dict['age_acc'] = 1 - inferred_age['>=40']

            inferred_org = inferred_user_stats['org']
            if inferred_org['is-org'] >= inferred_org['non-org']:
                user_dict['org'] = True
                user_dict['org_acc'] = inferred_org['is-org']
            else:
                user_dict['org'] = False
                user_dict['org_acc'] = inferred_org['non-org']

            writer.writerow(user_dict)
    output.close()
    utils.log('Finished writing results')
    stats['performance']['end_infer'] = datetime.datetime.utcnow()

    with stats_output:
        dumper.render_template(
            stats_template,
            stats_output,
            stats=stats,
        )

    if args.delete:
        try:
            utils.log("Deleting cache directory")
            shutil.rmtree(cache_dir)
        except OSError as e:
            utils.log(f"Error: {e.filename} - {e.strerror}.")
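# fieldnames is defined outside this snippet. A plausible reconstruction,
# based only on the keys written into user_dict above, would be the list
# below; the real module may add columns from init_user as well.
fieldnames = ['gender', 'gender_acc',
              'age', 'age_acc',
              'age_<=18_acc', 'age_19-29_acc', 'age_30-39_acc', 'age_>=40_acc',
              'org', 'org_acc']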
    '--screen-name',
    help='The screen_name of a Twitter user (i.e., everything following '
         'the @, but do not include @ itself)')
# parser.add_argument('--skip-cache', type=bool, nargs='?', const=True, default=False, help='By default all requests are cached to the local filesystem and not refetched. Include this flag to disable/overwrite any results already in the cache.')
parser.add_argument(
    '--skip-cache',
    dest='skip_cache',
    action='store_true',
    help='By default all requests are cached to the local filesystem and '
         'not refetched. Include this flag to disable/overwrite any results '
         'already in the cache.')
parser.set_defaults(skip_cache=False)

args = parser.parse_args()

if (args.id is None) == (args.screen_name is None):
    # User specified both id and screen-name, or neither one.
    sys.stderr.write("Exactly ONE of id or screen-name is required\n")
    parser.print_help(sys.stderr)
    sys.exit(1)

m3Twitter = M3Twitter()
m3Twitter.twitter_init_from_file(args.auth)

if args.id is not None:
    pprint.pprint(m3Twitter.infer_id(args.id, skip_cache=args.skip_cache))
else:
    pprint.pprint(
        m3Twitter.infer_screen_name(args.screen_name,
                                    skip_cache=args.skip_cache))
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)  # remove the # in #hashtag

    # collapse characters repeated three or more times down to two
    pattern = re.compile(r"(.)\1{2,}")
    tweet = pattern.sub(r"\1\1", tweet)

    # tokenize, then replace words the spell checker does not recognize
    tweet = word_tokenize(tweet)
    misspelled = spell.unknown(tweet)
    for word in misspelled:
        tweet = [spell.correction(word) if x == word else x for x in tweet]

    return ' '.join(tweet)


time1 = time.time()

# Change the cache_dir parameter to control where profile images are downloaded
m3twitter = M3Twitter(cache_dir='twitter cache')
analyzer = SentimentIntensityAnalyzer()

from m3inference import M3Inference
import pprint

columns = ['Tweet_id', 'name', 'id', 'location', 'text', 'sentiment', 'age',
           'gender', 'date', 'time', 'race', 'score', 'screen_name']
df = pd.DataFrame(columns=columns)
m3 = M3Inference()

with jsonlines.open('/Users/jonathanlai/Downloads/501.jsonl') as f:
    count = 0
    listx = []
    for line in f.iter():
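        # Hypothetical continuation only; the original loop body is cut off
        # above. It assumes each line is a tweet object with id_str and
        # full_text fields, and scores it with VADER's polarity_scores since
        # analyzer is a SentimentIntensityAnalyzer.
        count += 1
        scores = analyzer.polarity_scores(line.get('full_text', ''))
        listx.append({
            'Tweet_id': line.get('id_str'),
            'text': line.get('full_text'),
            'sentiment': scores['compound'],
        })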