Example #1
def process_lines(dump: io.TextIOWrapper, stats: Mapping,
                  args: argparse.Namespace, shared: dict,
                  m3twitter: M3Twitter) -> Iterator[dict]:
    """For each user in the dump, check whether the tweet count reaches the
    configured minimum and, if so, yield the dict produced by
    transform_jsonl_object for later inference.
    """

    for user in dump:
        stats['performance']['input']['users'] += 1
        # transform the object only if the user reached the minimum number of tweets
        if 'tweets' in user and int(user['tweets']) >= args.min_tweets:
            stats['performance']['input']['to_infer'] += 1
            shared[user['id_str']] = init_user(user)
            # handle empty profile_image_url_https
            if user['profile_image_url_https'] == "":
                user['default_profile_image'] = True
                stats['performance']['input']['img_errors'] += 1
            yield m3twitter.transform_jsonl_object(user)
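
# --- Usage sketch (not from the original module) ---
# Hypothetical driver: file name, min_tweets value, and cache dir are made up.
# Note that the loop above indexes each item like a dict, so despite the
# io.TextIOWrapper annotation, `dump` should yield parsed user objects.
if __name__ == '__main__':
    import json

    dump = (json.loads(line) for line in open('users.jsonl'))
    stats = {'performance': {'input': {'users': 0, 'to_infer': 0, 'img_errors': 0}}}
    args = argparse.Namespace(min_tweets=10)
    m3 = M3Twitter(cache_dir='twitter_cache')

    for obj in process_lines(dump, stats=stats, args=args, shared={}, m3twitter=m3):
        print(obj['img_path'])  # transformed objects carry the downloaded image path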
Example #2
def build_m3(username):
    # Gather info
    m3twitter = M3Twitter()
    res = m3twitter.infer_screen_name(username)

    # Separate profile elements
    prfs = []
    for attr in res['output']:
        keys = list(res['output'][attr].keys())
        vals = list(res['output'][attr].values())

        if attr == 'gender':
            category_desc = 'more masculine\nmore feminine'
        elif attr == 'age':
            category_desc = 'eighteen or younger\ntwenties\nthirties\nover forty'
        elif attr == 'org':
            attr = 'organization'
            category_desc = 'less like an organization\nmore like an organization'
            keys[0] = 'non-organization'
            keys[1] = 'organization'
        else:
            raise ValueError("Unexpected M3 category %s" % attr)

        score = max(vals)
        if score < 0.6:
            conf_str = 'low'
        elif score <= 0.85:
            conf_str = 'moderate'
        else:
            conf_str = 'high'

        p = Profile(username=username, attr_name=attr, classifier='m3',
                    predicted_class=keys[vals.index(score)],
                    class_confidence=score, confidence_str=conf_str,
                    attr_categories='\n'.join(keys),
                    attr_values='\n'.join(str(x) for x in vals),
                    category_desc=category_desc,
                    explanations='', global_explanations='',
                    is_current=True, is_sensitive=False)
        p.save()
        prfs.append(p)

    # Return profiles
    return tuple(prfs)
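
# Usage sketch (hypothetical screen name; assumes the Profile model exposes
# its fields as attributes, as Django/peewee-style models do):
profiles = build_m3('some_screen_name')
for p in profiles:
    print(p.attr_name, p.predicted_class, p.confidence_str)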
Example #3
    def __init__(self, pic_dir):
        # Profile images downloaded by M3Twitter are cached under pic_dir.
        self.m3twitter = M3Twitter(cache_dir=pic_dir)
Example #4
    """
    def clean_tweet(self, tweet):
        return ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)", " ", tweet).split())

    def analyze_sentiment(self, tweet):
        analysis = TextBlob(self.clean_tweet(tweet))

        if analysis.sentiment.polarity > 0:
            return 1
        elif analysis.sentiment.polarity == 0:
            return 0
        else:
            return -1

if __name__ == '__main__':
    PATH = "/Users/neilyeung/Desktop/DM_Final_Proj/data/testing/"
    final_df = pd.read_json(PATH + "2020-04-1_clean-dataset.jsonl", lines=True)

    # Polarity
    sent = Sentiment()
    final_df['sentiment'] = np.array([sent.analyze_sentiment(tweet) for tweet in final_df['full_text']])

    # M3Inference
    m3twitter = M3Twitter(cache_dir="twitter_cache")
    m3twitter.transform_jsonl(input_file="test/twitter_cache/2020-04-1_clean-dataset.jsonl",
                              output_file="test/twitter_cache/m3_input.jsonl")
    predictions = m3twitter.infer("test/twitter_cache/m3_input.jsonl")

    m3_df = pd.read_json(PATH + "test/twitter_cache/m3_input.jsonl", lines=True)

    # TODO: check what the resulting data frame looks like
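
    # A minimal sketch for the TODO above; nothing below is from the original
    # script, it just inspects whatever transform_jsonl produced.
    print(m3_df.columns.tolist())
    print(m3_df.head())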
Example #5
    parser.add_argument(
        '--skip-cache',
        dest='skip_cache',
        action='store_true',
        help=
        'By default all requests are cached to the local filesystem and not refetched. Include this flag to disable/overwrite any results already in the cache.'
    )
    parser.add_argument('--skip_logging',
                        action='store_true',
                        required=False,
                        help='(Optional) Skip logging info if set.')

    parser.set_defaults(skip_cache=False)
    args = parser.parse_args()
    if (args.id is None) == (args.screen_name is None):
        # User specified both id and screen-name or neither one.
        sys.stderr.write("Exactly ONE of id or screen-name is required\n")
        parser.print_help(sys.stderr)
        sys.exit(1)

    m3Twitter = M3Twitter(skip_logging=args.skip_logging)

    m3Twitter.twitter_init_from_file(args.auth)

    if args.id is not None:
        pprint.pprint(m3Twitter.infer_id(args.id, skip_cache=args.skip_cache))
    else:
        pprint.pprint(
            m3Twitter.infer_screen_name(args.screen_name,
                                        skip_cache=args.skip_cache))
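
# Design note (a sketch, not the original code): argparse can enforce the
# "exactly one of --id / --screen-name" rule natively, replacing the manual
# (args.id is None) == (args.screen_name is None) check above:
#
#     group = parser.add_mutually_exclusive_group(required=True)
#     group.add_argument('--id')
#     group.add_argument('--screen-name', dest='screen_name')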
Example #6
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with m3webdemo.  If not, see <https://www.gnu.org/licenses/>.

from flask import Flask, url_for, render_template, Response, request

from m3inference import M3Twitter
import json
import glob
import urllib
import os

app = Flask(__name__)

m3twitter = M3Twitter(cache_dir="static/m3/", model_dir="./")
m3twitter.twitter_init(api_key=os.environ["api_key"],
                       api_secret=os.environ["api_secret"],
                       access_token=os.environ["access_token"],
                       access_secret=os.environ["access_secret"])
screen_name_list = [
    x.replace(".json", "").replace("static/m3/", "")
    for x in glob.glob("static/m3/*.json")
]


@app.route('/')
def index():
    return render_template("index.html")
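
# Hypothetical extra route (not in the original app): expose an inference
# result as JSON. infer_screen_name is used the same way as in the CLI
# examples elsewhere on this page.
@app.route('/infer/<screen_name>')
def infer(screen_name):
    res = m3twitter.infer_screen_name(screen_name)
    return Response(json.dumps(res), mimetype='application/json')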

Example #7
def main(dump: io.TextIOWrapper, basename: str, args: argparse.Namespace,
         shared: dict) -> None:
    """Main function that parses the arguments and writes the output."""
    stats = {
        'performance': {
            'start_preprocess': None,
            'end_preprocess': None,
            'start_infer': None,
            'end_infer': None,
            'input': {
                'users': 0,
                'to_infer': 0,
                'img_errors': 0
            },
        },
    }

    stats['performance']['start_preprocess'] = datetime.datetime.utcnow()
    cache_dir = f"{args.output_dir_path}/twitter_cache_{args.cache_dir}"
    m3twitter = M3Twitter(cache_dir=cache_dir, use_full_model=True)

    # process the dump
    res = process_lines(dump,
                        stats=stats,
                        args=args,
                        shared=shared,
                        m3twitter=m3twitter)

    m3_input_file = f"{cache_dir}/m3_input.jsonl"
    output = fu.output_writer(path=m3_input_file, compression=None, mode='wt')

    for obj in res:
        # handle error while downloading an image
        if not os.path.exists(obj['img_path']):
            obj['img_path'] = TW_DEFAULT_PROFILE_IMG
            stats['performance']['input']['img_errors'] += 1
        output.write(json.dumps(obj))
        output.write("\n")

    output.close()

    stats['performance']['end_preprocess'] = datetime.datetime.utcnow()

    stats['performance']['start_infer'] = datetime.datetime.utcnow()

    inferred_users = m3twitter.infer(m3_input_file)

    stats_output = open(os.devnull, 'wt')
    output = open(os.devnull, 'wt')
    if not args.dry_run:
        # extract useful info from the name
        path_list = re.split(r'-|\.', basename)
        lang = path_list[0]

        stats_path = f"{args.output_dir_path}/infer-users/stats/{lang}"
        Path(stats_path).mkdir(parents=True, exist_ok=True)
        varname = f"{basename}-{os.getpid()}.infer-users"
        stats_filename = f"{stats_path}/{varname}.stats.xml"

        stats_output = fu.output_writer(path=stats_filename,
                                        compression=args.output_compression,
                                        mode='wt')

        file_path = f"{args.output_dir_path}/infer-users"
        Path(file_path).mkdir(parents=True, exist_ok=True)

        output_filename = f"{file_path}/{lang}-users-inference-{os.getpid()}.csv"

        output = fu.output_writer(path=output_filename,
                                  compression=args.output_compression,
                                  mode='wt')

    utils.log('Writing the results...')
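    # `fieldnames` is defined outside this snippet; judging from the keys
    # written below, a plausible (hypothetical) definition is the init_user()
    # fields plus 'gender', 'gender_acc', the per-bucket 'age_*_acc' columns,
    # 'age', 'age_acc', 'org' and 'org_acc'.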

    writer = csv.DictWriter(output, fieldnames=fieldnames)
    writer.writeheader()
    for user in inferred_users:
        if user in shared:
            user_dict = shared[user]
            inferred_user_stats = inferred_users[user]

            inferred_gender = inferred_user_stats['gender']
            if inferred_gender['female'] >= inferred_gender['male']:
                user_dict['gender'] = 'female'
                user_dict['gender_acc'] = inferred_gender['female']
            else:
                user_dict['gender'] = 'male'
                user_dict['gender_acc'] = inferred_gender['male']

            inferred_age = inferred_user_stats['age']

            for age, accuracy in inferred_age.items():
                user_dict[f'age_{age}_acc'] = accuracy

            if inferred_age['>=40'] >= 1 - inferred_age['>=40']:
                user_dict['age'] = '>=40'
                user_dict['age_acc'] = inferred_age['>=40']
            else:
                user_dict['age'] = '<40'
                user_dict['age_acc'] = 1 - inferred_age['>=40']

            inferred_org = inferred_user_stats['org']
            if inferred_org['is-org'] >= inferred_org['non-org']:
                user_dict['org'] = True
                user_dict['org_acc'] = inferred_org['is-org']
            else:
                user_dict['org'] = False
                user_dict['org_acc'] = inferred_org['non-org']

            writer.writerow(user_dict)

    output.close()

    utils.log('Finished writing results')

    stats['performance']['end_infer'] = datetime.datetime.utcnow()

    with stats_output:
        dumper.render_template(
            stats_template,
            stats_output,
            stats=stats,
        )

    if args.delete:
        try:
            utils.log("Deleting cache directory")
            shutil.rmtree(cache_dir)
        except OSError as e:
            utils.log(f"Error: {e.filename} - {e.strerror}.")
Example #8
        '--screen-name',
        help=
        'The screen_name of a Twitter user (i.e., everything following the @, but do not include @ itself)'
    )
    parser.add_argument(
        '--skip-cache',
        dest='skip_cache',
        action='store_true',
        help=
        'By default all requests are cached to the local filesystem and not refetched. Include this flag to disable/overwrite any results already in the cache.'
    )
    parser.set_defaults(skip_cache=False)
    args = parser.parse_args()
    if (args.id is None) == (args.screen_name is None):
        # User specified both id and screen-name or neither one.
        sys.stderr.write("Exactly ONE of id or screen-name is required\n")
        parser.print_help(sys.stderr)
        sys.exit(1)

    m3Twitter = M3Twitter()

    m3Twitter.twitter_init_from_file(args.auth)

    if args.id is not None:
        pprint.pprint(m3Twitter.infer_id(args.id, skip_cache=args.skip_cache))
    else:
        pprint.pprint(
            m3Twitter.infer_screen_name(args.screen_name,
                                        skip_cache=args.skip_cache))
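
# For reference, a sketch of the result shape inferred from the other
# examples on this page (not verified output): each attribute maps to a
# probability distribution under 'output', e.g.
#
#     {'output': {'gender': {'male': ..., 'female': ...},
#                 'age': {'<=18': ..., '19-29': ..., '30-39': ..., '>=40': ...},
#                 'org': {'non-org': ..., 'is-org': ...}}}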
Example #9
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)  # remove the # in #hashtag
    pattern = re.compile(r"(.)\1{2,}")
    tweet = pattern.sub(r"\1\1", tweet)
    tweet = word_tokenize(tweet)

    misspelled = spell.unknown(tweet)

    for word in misspelled:
        tweet = [spell.correction(word) if x == word else x for x in tweet]
    return ' '.join(tweet)

time1 = time.time()
# Change the cache_dir parameter to control where profile images are downloaded.
m3twitter = M3Twitter(cache_dir='twitter cache')
analyzer = SentimentIntensityAnalyzer()

from m3inference import M3Inference
import pprint

Columns = ['Tweet_id', 'name', 'id', 'location', 'text', 'sentiment', 'age',
           'gender', 'date', 'time', 'race', 'score', 'screen_name']
df = pd.DataFrame(columns=Columns)

m3 = M3Inference()


with jsonlines.open('/Users/jonathanlai/Downloads/501.jsonl') as f:
    count = 0
    listx = []

    for line in f.iter():