Code example #1
def main(remoteSavePath):

    output = {}

    for file in listdir('results'):
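        # upload every regular file under results/ to S3 and map known artifact names to download links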
        if isfile(join('results', file)):
            s3.upload('results', remoteSavePath, file)

            if file == 'config.json':
                output['config'] = s3.generate_downloads(remoteSavePath, file)
            elif file == 'div.html':
                output['visualization'] = s3.generate_downloads(remoteSavePath, file)
            elif file == 'AutoPhrase_multi-words.txt':
                output['multi-words'] = s3.generate_downloads(remoteSavePath, file)
            elif file == 'AutoPhrase_single-word.txt':
                output['single-word'] = s3.generate_downloads(remoteSavePath, file)
            elif file == 'AutoPhrase.txt':
                output['autophrase'] = s3.generate_downloads(remoteSavePath, file)
            elif file == 'segmentation.model':
                output['model'] = s3.generate_downloads(remoteSavePath, file)
            elif file == 'token_mapping.txt':
                output['token-mapping'] = s3.generate_downloads(remoteSavePath, file)
            else:
                output['misc'] = s3.generate_downloads(remoteSavePath, file)

    return output
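For context, main() relies on listdir, isfile, and join from the standard library plus the project's s3 helper module; a minimal sketch of the surrounding setup and a call might look like the following (the import path of s3 and the remote prefix are assumptions, not part of the original snippet):

from os import listdir
from os.path import isfile, join

import s3  # assumed: the project's helper wrapping S3 uploads and download-URL generation

# hypothetical remote prefix for an AutoPhrase run
links = main('sessionID/autophrase/')
print(links.get('autophrase'))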
Code example #2
def lambda_handler(event, context):

    output = dict()

    uid = event['uid']
    awsPath = event['s3FolderName'] + '/ML/classification/' + uid + '/'
    localSavePath = '/tmp/' + event['s3FolderName'] + \
        '/ML/classification/' + uid + '/'
    if not os.path.exists(localSavePath):
        os.makedirs(localSavePath)

    # download config to local folder
    fname_config = 'config.json'
    try:
        s3.downloadToDisk(fname_config, localSavePath, awsPath)
        with open(localSavePath + fname_config, "r") as fp:
            data = json.load(fp)
            for key in data.keys():
                if key not in event.keys():
                    event[key] = data[key]
        with open(localSavePath + fname_config, "w") as f:
            json.dump(event, f)
        s3.upload(localSavePath, awsPath, fname_config)
        output['config'] = s3.generate_downloads(awsPath, fname_config)
        output['uid'] = uid

    except:
        raise ValueError('This session ID is invalid!')

    # download unlabeled data to local folder
    fname_unlabeled = 'testing.csv'
    try:
        s3.downloadToDisk(fname_unlabeled, localSavePath, awsPath)
    except:
        raise ValueError(
            'You\'re requesting ' + fname_unlabeled +
            ' file, and it\'s not found in your remote directory! '
            'It is likely that you have not yet performed step 1 -- split the '
            'dataset into training and predicting set, or you have provided '
            'the wrong sessionID.')

    #download pickle model to local folder
    fname_pickle = 'pipeline.pickle'
    try:
        s3.downloadToDisk(fname_pickle, localSavePath, awsPath)
    except:
        raise ValueError(
            'You\'re requesting ' + fname_pickle +
            ' file, and it\'s not found in your remote directory! '
            'It is likely that you have not yet performed step 2 -- model '
            'training, or you have provided the wrong sessionID.')

    classification = Classification(awsPath, localSavePath)
    output['predicting'] = classification.predict()
    output['div_category'] = classification.plot()

    return output
Code example #3
def lambda_handler(event, context):
    # create local path
    localPath = os.path.join('/tmp', 'crimson')
    if not os.path.exists(localPath):
        os.makedirs(localPath)

    today = date.today()
    yesterday = today - timedelta(days=1)
    dayBeforeYesterday = today - timedelta(days=2)
    fname = collect_crimson_monitor_data(
        dayBeforeYesterday.strftime("%Y-%m-%d"),
        yesterday.strftime("%Y-%m-%d"), localPath)

    s3.upload("macroscope-paho-covid", localPath, "input/crimson", fname)

    return None
Code example #4
def lambda_handler(event, context):
    # create local path
    localPath = os.path.join('/tmp', 'sentiment')
    if not os.path.exists(localPath):
        os.makedirs(localPath)

    # collect top sources and plot
    today = date.today()
    yesterday = today - timedelta(days=1)
    dayBeforeYesterday = today - timedelta(days=2)
    fnames = crimson_sentiment(dayBeforeYesterday.strftime("%Y-%m-%d"),
                               yesterday.strftime("%Y-%m-%d"), localPath)
    for fname in fnames:
        s3.upload("macroscope-paho-covid", localPath, "sentiment", fname)

    return None
Code example #5
def save_remote_output(localSavePath, remoteSavePath, fname):
    """

    :param localSavePath:
    :param remoteSavePath:
    :param fname:
    :param output_data:
    :return:
    """
    zipf = zipfile.ZipFile(os.path.join(localSavePath, fname), 'w',
                           zipfile.ZIP_DEFLATED)
    zipdir(os.path.join(localSavePath, 'img'), zipf)
    zipf.close()

    s3.upload(localSavePath, remoteSavePath, fname)
    url = s3.generate_downloads(remoteSavePath, fname)

    return url
Code example #6
    def plot(self):
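        # summarize the predicted class distribution as a pie chart, publish the HTML div to S3, and return its download URL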
        y_pred_dict = Counter(self.predicted)
        labels = []
        values = []
        for i in y_pred_dict.keys():
            labels.append("class: " + str(i))
            values.append(y_pred_dict[i])
        trace = go.Pie(labels=labels, values=values, textinfo='label')
        div_category = plot([trace],
                            output_type='div',
                            image='png',
                            auto_open=False,
                            image_filename='plot_img')

        fname_div_category = 'div_category.html'
        with open(self.localSavePath + fname_div_category, "w") as f:
            f.write(div_category)
        s3.upload(self.localSavePath, self.awsPath, fname_div_category)
        return s3.generate_downloads(self.awsPath, fname_div_category)
Code example #7
def lambda_handler(event, context):

    awsPath = os.path.join(event['sessionID'], event['screen_name'])
    localPath = os.path.join('/tmp', event['sessionID'], event['screen_name'])
    if not os.path.exists(localPath):
        os.makedirs(localPath)
    screen_name = event['screen_name']

    try:
        s3.downloadToDisk(screen_name + '_tweets.txt', localPath, awsPath)
    except:
        raise ValueError('Cannot find the timeline in the remote storage!')

    with open(os.path.join(localPath, screen_name + '_tweets.txt'),
              'r') as personality_text:
        headers = {'Content-Type': 'text/plain', 'Accept': 'application/json'}

        # concatenate the text field to be a paragraph
        df = pd.read_csv(os.path.join(localPath, screen_name + '_tweets.txt'))
        tweets = df['text'].tolist()
        body = '. '.join(tweets).encode('utf-8', 'ignore')

        r = requests.post(
            'https://gateway.watsonplatform.net/personality-insights/api/v3/profile?version=2017-10-13&consumption_preferences=true&raw_scores=true',
            headers=headers,
            data=body,
            auth=('apikey', event['apikey']),
            timeout=300)

        if r.status_code == 200:
            data = {'personality': r.json()}

            with open(
                    os.path.join(localPath,
                                 screen_name + '_personality' + '.json'),
                    'w') as outfile:
                json.dump(data, outfile)

            s3.upload(localPath, awsPath, screen_name + '_personality.json')

            return data
        else:
            raise ValueError(r.text)
Code example #8
    def metrics(self):
        report = np.array(
            metrics.precision_recall_fscore_support(
                self.target, self.predicted, labels=self.labels)).T
        avg_report = list(
            metrics.precision_recall_fscore_support(
                self.target, self.predicted, average='weighted'))
        avg_report.insert(0, 'AVG')

        # save metrics report
        fname_metrics = 'classification_report.csv'
        with open(self.localSavePath + fname_metrics, 'w', newline="") as f:
            writer = csv.writer(f)
            writer.writerow(
                ['label', 'precision', 'recall', 'f1-score', 'support'])
            for i in range(len(report)):
                writer.writerow([self.labels[i],
                                 round(report[i][0], 4),
                                 round(report[i][1], 4),
                                 round(report[i][2], 4),
                                 round(report[i][3], 4)])
            writer.writerow(avg_report)
        s3.upload(self.localSavePath, self.awsPath, fname_metrics)
        return {'metrics': s3.generate_downloads(self.awsPath, fname_metrics)}
Code example #9
def lambda_handler(event, context):
    # create local path
    localPath = os.path.join('/tmp', 'hashtag')
    if not os.path.exists(localPath):
        os.makedirs(localPath)

    # download triggered file
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = unquote_plus(event['Records'][0]['s3']['object']['key'])
    remotePath = "/".join(key.split("/")[:-1])
    filename = key.split("/")[-1]
    s3.downloadToDisk(bucket, filename, localPath, remotePath)

    # load to dataframe
    df = pd.read_csv(os.path.join(localPath, filename))

    # extract hashtag
    hash = extract_hashtag(df)

    # plot bar chart (frequency chart)
    index = hash['hashtags'].values.tolist()[:10]
    counts = hash['Freq'].values.tolist()[:10]
    title = 'Top 10 prevalent hashtags (' + filename.split(".")[0] + ')'
    div = plot.plot_bar_chart(index, counts, title)

    # save result and write back to s3
    hash_filename = filename.split(".")[0]

    hash.to_csv(os.path.join(localPath,
                             hash_filename + "_extracted_hashtag.csv"),
                index=False)
    s3.upload("macroscope-paho-covid", localPath, "hashtags",
              hash_filename + "_extracted_hashtag.csv")

    with open(
            os.path.join(localPath,
                         hash_filename + "_extracted_hashtag_frequency.html"),
            'w') as f:
        f.write(div)
    s3.upload("macroscope-paho-covid", localPath, "hashtags",
              hash_filename + "_extracted_hashtag_frequency.html")

    return None
Code example #10
def lambda_handler(event, context):

    awsPath = os.path.join(event['sessionID'], event['screen_name'])
    localSavePath = os.path.join('/tmp', event['sessionID'],
                                 event['screen_name'])
    if not os.path.exists(localSavePath):
        os.makedirs(localSavePath)

    auth = tweepy.OAuthHandler(event['consumer_key'], event['consumer_secret'])
    auth.set_access_token(event['access_token'], event['access_token_secret'])
    api = tweepy.API(auth)

    tweets = []
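    # page through the user's timeline, keeping each tweet's id and full text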
    for status in tweepy.Cursor(api.user_timeline,
                                screen_name=event['screen_name'],
                                count=100,
                                tweet_mode="extended").items():
        tweets.append([
            status._json['id'],
            status._json['full_text'].encode('utf-8', 'ignore').decode()
        ])

    if len(tweets) > 0:
        fname = event['screen_name'] + '_tweets.txt'
        with open(os.path.join(localSavePath, fname),
                  'w',
                  encoding='utf-8',
                  newline='') as f:
            header = ['id', 'text']
            writer = csv.writer(f, delimiter=",")
            writer.writerow(header)
            for row in tweets:
                writer.writerow(row)

        s3.upload(localSavePath, awsPath, fname)

        return {'url': s3.generate_downloads(awsPath, fname)}
    else:
        raise ValueError('This user\'s timeline (screen_name: ' +
                         event['screen_name'] +
                         ') is empty. There is nothing to analyze!')
Code example #11
def related_queries(keywords, language, localPath):
    if language.lower() == 'spanish':
        pytrend = TrendReq(hl='sp-SP')
    else:
        pytrend = TrendReq()

    timeframes = {'now 1-d': '1day', 'now 7-d': '7days', 'today 1-m': '30days'}

    # Google Trends limits the combined length of the keywords in one request,
    # so batch them (kept conservatively under 50 characters here) and loop until all are sent
    while len(keywords) > 0:
        character_len = 0
        keywords_split = []
        for kk in keywords:
            character_len += len(kk)
            if character_len < 50:
                keywords_split.append(kk)
        for item in keywords_split:
            keywords.remove(item)

        indices = {}
        counts = {}
        title = {}
        subtitles = {}

        for timeframe in timeframes.keys():
            pytrend.build_payload(kw_list=keywords_split, timeframe=timeframe)
            df_queries = pytrend.related_queries()

            for keyword in keywords_split:

                if keyword not in indices.keys():
                    indices[keyword] = []
                if keyword not in counts.keys():
                    counts[keyword] = []
                if keyword not in subtitles.keys():
                    subtitles[keyword] = []

                df_top = df_queries[keyword]['top']
                df_rising = df_queries[keyword]['rising']

                if df_top is not None and df_rising is not None:
                    # plot bar chart side by side
                    indices[keyword].append([df_top["query"].tolist()[:10], df_rising["query"].tolist()[:10]])
                    counts[keyword].append([df_top["value"].tolist()[:10], df_rising["value"].tolist()[:10]])
                    title[keyword] = "Google Trends Queries related to keyword: " + keyword
                    subtitles[keyword].append(["top related query(" + timeframes[timeframe] + ")",
                                               "rising related query(" + timeframes[timeframe] + ")"])

                    # save csv
                    df_top.rename(columns={'query': 'top related query'}, inplace=True)
                    df_rising.rename(columns={'query': 'rising related query'}, inplace=True)
                    result = pd.concat([df_top, df_rising], axis=1)
                    result.to_csv(os.path.join(localPath, keyword.replace(" ", "_") + "_" + timeframes[timeframe] +
                                               "_related_queries.csv"), index=False)
                    s3.upload("macroscope-paho-covid", localPath, "related_queries",
                              keyword.replace(" ", "_") + "_" + timeframes[timeframe] +
                              "_related_queries.csv")

        for keyword in keywords_split:
            div = plot.plot_multiple_bar_chart(indices[keyword], counts[keyword], title[keyword], subtitles[keyword])
            with open(os.path.join(localPath, keyword.replace(" ", "_") + "_related_queries.html"), 'w') as f:
                f.write(div)
            s3.upload("macroscope-paho-covid", localPath, "related_queries",
                      keyword.replace(" ", "_") + "_related_queries.html")

    return None
Code example #12
    def predict(self):

        # load classification model
        pkl_model = os.path.join(self.localSavePath, 'pipeline.pickle')
        with open(pkl_model, 'rb') as f:
            text_clf = pickle.load(f)

        # load text set
        data = []
        try:
            with open(self.localSavePath + 'testing.csv',
                      'r',
                      encoding='utf-8',
                      errors="ignore") as f:
                reader = list(csv.reader(f))
                for row in reader[1:]:
                    try:
                        data.extend(row)
                    except Exception as e:
                        pass
        except:
            with open(self.localSavePath + 'testing.csv',
                      'r',
                      encoding='ISO-8859-1',
                      errors="ignore") as f:
                reader = list(csv.reader(f))
                for row in reader[1:]:
                    try:
                        data.extend(row)
                    except Exception as e:
                        pass

        # predict using trained model
        self.predicted = text_clf.predict(data)

        # save result
        fname = 'predicting.csv'
        try:
            with open(self.localSavePath + fname,
                      'w',
                      newline="",
                      encoding='utf-8',
                      errors="ignore") as f:
                writer = csv.writer(f)
                writer.writerow(['text', 'category'])
                for i in range(len(data)):
                    try:
                        writer.writerow([data[i], self.predicted[i]])
                    except:
                        pass
        except:
            with open(self.localSavePath + fname,
                      'w',
                      newline="",
                      encoding='ISO-8859-1',
                      errors="ignore") as f:
                writer = csv.writer(f)
                writer.writerow(['text', 'category'])
                for i in range(len(data)):
                    try:
                        writer.writerow([data[i], self.predicted[i]])
                    except:
                        pass
        s3.upload(self.localSavePath, self.awsPath, fname)
        return s3.generate_downloads(self.awsPath, fname)
Code example #13
    try:
        s3.downloadToDisk(screen_name + '_tweets.txt', localPath, awsPath)
    except:
        raise ValueError('Cannot find the timeline in the remote storage!')

    # calculate brand personality
    model = MultiLabelClassificationModel('roberta',
                                          'checkpoint-17315-epoch-5',
                                          num_labels=5,
                                          args={
                                              "reprocess_input_data": True,
                                              'use_cached_eval_features': False
                                          },
                                          use_cuda=False)
    df = pd.read_csv(os.path.join(localPath, screen_name + '_tweets.txt'))
    new_df = multiple_sentences(df, model)
    fname_sentences = screen_name + '_utku_personality_sentences.csv'
    new_df.to_csv(os.path.join(localPath, fname_sentences), index=False)
    s3.upload(localPath, awsPath, fname_sentences)

    # get the average score
    mean_metrics = average(new_df)
    fname_average = screen_name + '_utku_personality_average.json'
    with open(os.path.join(localPath, fname_average), 'w') as f:
        json.dump(mean_metrics, f)
    s3.upload(localPath, awsPath, fname_average)

    # push notification email
    notification(toaddr=params['email'], sessionURL=params['sessionURL'])
Code example #14
def lambda_handler(event, context):

    screen_names = [
        "msalnacion", "gisbarbados", "MFABelize", "MinSaludBolivia",
        "minsaude", "GovCanHealth", "caymangovt", "ministeriosalud",
        "MinSaludCol", "msaludcr", "GoDomRep", "Salud_Ec", "ars_guyane",
        "MsppOfficiel", "themohwgovjm", "GobiernoMX", "msaludpy",
        "PeruPaisDigital", "skngov", "MOH_TT", "MSPUruguay", "USAGov"
    ]

    # create local path
    localPath = os.path.join('/tmp', 'tweets')
    if not os.path.exists(localPath):
        os.makedirs(localPath)

    # collect timeline
    auth = tweepy.OAuthHandler(os.environ['consumer_key'],
                               os.environ['consumer_secret'])
    auth.set_access_token(os.environ['access_token'],
                          os.environ['access_token_secret'])
    api = tweepy.API(auth)

    header = [
        "created_at", "id", "id_str", "full_text", "truncated",
        "display_text_range", "source", "in_reply_to_status_id",
        "in_reply_to_status_id_str", "in_reply_to_user_id",
        "in_reply_to_user_id_str", "in_reply_to_screen_name",
        "is_quote_status", "retweet_count", "favorite_count", "favorited",
        "retweeted", "possibly_sensitive", "lang"
    ]

    for screen_name in screen_names:

        tweets = []
        for status in tweepy.Cursor(api.user_timeline,
                                    screen_name=screen_name,
                                    count=200,
                                    tweet_mode="extended").items():
            if "created_at" in status._json.keys(
            ) and status._json["created_at"][-4:] == "2020":
                tweet = []
                for key in header:
                    if key in status._json.keys():
                        # make sure date

                        if key == 'full_text':
                            tweet.append(status._json[key].encode(
                                'utf-8', 'ignore').decode())
                        else:
                            tweet.append(status._json[key])
                    else:
                        tweet.append("NA")
                tweets.append(tweet)
            else:
                break

        if len(tweets) > 0:
            fname = screen_name + '_tweets.csv'
            with open(os.path.join(localPath, fname),
                      'w',
                      encoding='utf-8',
                      newline='') as f:
                writer = csv.writer(f, delimiter=",")
                writer.writerow(header)
                for row in tweets:
                    writer.writerow(row)

            s3.upload("macroscope-paho-covid", localPath, "input/twitter",
                      fname)

            time.sleep(2)

    return None
Code example #15
def save_remote_output(localSavePath, remoteSavePath, fname, output_data):
    """
    save output in memory first to local file, then upload to remote S3 bucket
    :param localSavePath: local saved file
    :param remoteSavePath: remote save file path
    :param fname: filename
    :param output_data: the actual data
    :return: url of the file saved in S3 bucket
    """

    # json
    if isinstance(output_data, dict):
        fname += '.json'
        with open(os.path.join(localSavePath, fname), 'w') as f:
            json.dump(output_data, f)

    # # dataframe to csv
    # elif isinstance(output_data, pd.DataFrame):
    #     fname += '.csv'
    #     output_data.to_csv(fname)

    # string to html
    elif isinstance(output_data, str):
        fname += '.html'
        with open(os.path.join(localSavePath, fname), 'w') as f:
            f.write(output_data)

    # list(list) to csv
    elif isinstance(output_data, list) \
            and (isinstance(output_data[0], list) or isinstance(output_data[0],
                                                                tuple)):
        fname += '.csv'
        with open(os.path.join(localSavePath, fname),
                  'w',
                  newline='',
                  encoding='utf-8') as f:
            writer = csv.writer(f)
            for row in output_data:
                try:
                    writer.writerow(row)
                except UnicodeEncodeError as e:
                    print(e)

    # generator
    elif isinstance(output_data, types.GeneratorType):
        if fname == 'gephi':
            fname += '.gml'
        elif fname == 'pajek':
            fname += '.net'
        else:
            fname += '.unknown'

        with open(os.path.join(localSavePath, fname), 'w', newline='') as f:
            for line in output_data:
                f.write(line + '\n')

    # else pickle the object
    else:
        fname += '.pickle'
        with open(os.path.join(localSavePath, fname), 'wb') as f:
            pickle.dump(output_data, f)

    s3.upload(localSavePath, remoteSavePath, fname)
    url = s3.generate_downloads(remoteSavePath, fname)

    return url
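A minimal usage sketch of save_remote_output (the session paths below are hypothetical, and s3 is the project's upload helper assumed by this snippet): a dict is written as <fname>.json, a string as <fname>.html, and a list of lists or tuples as <fname>.csv before being uploaded.

# hypothetical paths; each call returns the download URL of the uploaded file
json_url = save_remote_output('/tmp/session/', 'sessionID/output/', 'summary',
                              {'tweets': 120, 'hashtags': 45})
html_url = save_remote_output('/tmp/session/', 'sessionID/output/', 'report',
                              '<div>rendered plot</div>')
csv_url = save_remote_output('/tmp/session/', 'sessionID/output/', 'counts',
                             [('hashtag', 'count'), ('#COVID19', 87)])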
Code example #16
def calc_tweet_personality(sessionID, screen_name, profile_img):

    # load embedding dataset
    curr_path = os.path.dirname(os.path.abspath(__file__))

    dataset_path = curr_path + "/fastText/wiki-news-300d-1M.vec"
    wordDictionary = dsu.parseFastText(dataset_path)

    # load predictive models
    models = {}
    for trait in ["O", "C", "E", "A", "N"]:
        models[trait] = joblib.load(curr_path + "/models/model_" + trait +
                                    ".pkl")

    # read tweets
    awsPath = os.path.join(sessionID, screen_name)
    sessionDir = os.environ['SESSIONDIR']
    localPath = os.path.join(sessionDir + '/collection', sessionID)
    if not os.path.exists(localPath):
        try:
            os.makedirs(localPath)
        except:
            pass

    try:
        s3.downloadToDisk(screen_name + '_tweets.txt', localPath, awsPath)
    except:
        raise ValueError('Cannot find the timeline in the remote storage!')

    # process the tweets
    tweet_file_path = os.path.join(localPath, screen_name + '_tweets.txt')
    filteredTweets = []
    word_count = 0
    for tweet in open(tweet_file_path, "r", encoding="utf-8"):
        if re.match(r'^(RT)', tweet) or tweet == '\n' \
                or tweet == '' or tweet == ' ':
            continue

        #remove links starting with "http"
        tweet = re.sub(r'((http)([^\s]*)(\s|$))|((http)([^\s]*)$)', "", tweet)
        #remove links with no http (probably unnecessary)
        tweet = re.sub(
            r'(\s([^\s]*)\.([^\s]*)\/([^\s]*)\s)|(^([^\s]*)\.([^\s]*)\/([^\s]*)(\s|$))|(\s([^\s]*)\.([^\s]*)\/([^\s]*)$)',
            " ", tweet)
        #remove mentions
        tweet = re.sub(r'(\s(@)([^\s]*)\s)|((^@)([^\s]*)(\s|$))|(@([^\s]*)$)',
                       " ", tweet)
        #hashtags are removed by countvectorizer
        filteredTweets.append(tweet)

        word_count += len(tweet.split())

    # warn if nothing usable remains after filtering the timeline
    if len(filteredTweets) == 0:
        print("Not enough tweets for prediction.")

    # now we can process the tweets using embeddings.transformTextForTesting
    try:
        tweetEmbeddings = embeddings.transformTextForTesting(
            wordDictionary, 3, filteredTweets, "conc")
    except:
        print("Not enough tweets for prediction.")

    # predict using saved models
    # range is 0 ~ 5
    scores = {}
    for trait in ["O", "C", "E", "A", "N"]:
        model = models[trait]
        preds = model.predict(tweetEmbeddings)
        scores[trait] = float(str(np.mean(np.array(preds)))[0:5])

    jung = ""
    if scores["E"] > 3:
        jung = "E"
    else:
        jung = "I"
    if scores["O"] > 3:
        jung = jung + "N"
    else:
        jung = jung + "S"
    if scores["A"] > 3:
        jung = jung + "F"
    else:
        jung = jung + "T"
    if scores["C"] > 3:
        jung = jung + "J"
    else:
        jung = jung + "P"

    scores["jung"] = jung

    # sort the output
    result = {}
    result['screen_name'] = screen_name
    result['profile_img'] = profile_img
    result['personality'] = {
        "word_count":
        word_count,
        "processed_language":
        "en",
        'personality': [{
            'name': 'Openness',
            'percentile': scores['O'] / 5
        }, {
            'name': 'Conscientiousness',
            'percentile': scores['C'] / 5
        }, {
            'name': 'Extraversion',
            'percentile': scores['E'] / 5
        }, {
            'name': 'Agreeableness',
            'percentile': scores['A'] / 5
        }, {
            'name': 'Emotional range',
            'percentile': scores['N'] / 5
        }]
    }

    # save to json and upload to s3 bucket
    with open(os.path.join(localPath, screen_name + '_twitPersonality.json'),
              'w') as outfile:
        json.dump(result, outfile)
    s3.upload(localPath, awsPath, screen_name + '_twitPersonality.json')

    # delete localPath files
    try:
        os.remove(os.path.join(localPath, screen_name + '_tweets.txt'))
        os.remove(
            os.path.join(localPath, screen_name + '_twitPersonality.json'))
    except:
        # already deleted!
        pass

    print(s3.generate_downloads(awsPath,
                                screen_name + '_twitPersonality.json'))

    return result
Code example #17
    def classify(self, model):
        # build a CountVectorizer -> TF-IDF -> classifier pipeline for the requested
        # model, cross-validate it (10 folds), pickle it, and plot ROC curves

        if model == 'NaiveBayes':
            text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                                 ('tfidf', TfidfTransformer()),
                                 ('clf',MultinomialNB())])
            # 10 fold cross validation 
            self.predicted = cross_val_predict(text_clf, self.data, self.target, cv=10)
            # fit the model
            text_clf.fit(self.data, self.target)
            y_score = text_clf.predict_proba(self.data)
        elif model == 'Perceptron':
            text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                                 ('tfidf', TfidfTransformer()),
                                 ('clf',Perceptron())])
            # 10 fold cross validation 
            self.predicted = cross_val_predict(text_clf, self.data, self.target, cv=10)
            # fit the model
            text_clf.fit(self.data, self.target)
            y_score = text_clf.decision_function(self.data)
        elif model == 'SGD':
            text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                                 ('tfidf', TfidfTransformer()),
                                 ('clf',SGDClassifier())])
            # 10 fold cross validation 
            self.predicted = cross_val_predict(text_clf, self.data, self.target, cv=10)
            # fit the model
            text_clf.fit(self.data, self.target)
            y_score = text_clf.decision_function(self.data)
        elif model == 'RandomForest':
            text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                                 ('tfidf', TfidfTransformer()),
                                 ('clf',RandomForestClassifier(n_estimators=100))])
            # 10 fold cross validation 
            self.predicted = cross_val_predict(text_clf, self.data, self.target, cv=10)
            # fit the model
            text_clf.fit(self.data, self.target)
            y_score = text_clf.predict_proba(self.data)
        elif model == 'KNN':
            text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                                 ('tfidf', TfidfTransformer()),
                                 ('clf',KNeighborsClassifier(n_neighbors=10))])
            # 10 fold cross validation 
            self.predicted = cross_val_predict(text_clf, self.data, self.target, cv=10)
            # fit the model
            text_clf.fit(self.data, self.target)
            y_score = text_clf.predict_proba(self.data)
        elif model == 'passiveAggressive':
            text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                                 ('tfidf', TfidfTransformer()),
                                 ('clf',PassiveAggressiveClassifier(n_iter=50))])
            # 10 fold cross validation 
            self.predicted = cross_val_predict(text_clf, self.data, self.target, cv=10)
            # fit the model
            text_clf.fit(self.data, self.target)
            y_score = text_clf.decision_function(self.data)           
            
        # get 10 fold cross validation accuracy score
        fold_scores = cross_val_score(text_clf, self.data, self.target, cv=10)
        fname_folds = 'accuracy_score.csv'
        with open(self.localSavePath + fname_folds,'w',newline="") as f:
            writer = csv.writer(f)
            writer.writerow(['fold_1','fold_2','fold_3','fold_4','fold_5',
                             'fold_6','fold_7','fold_8','fold_9','fold_10'])
            writer.writerow([ '%.4f' % elem for elem in fold_scores ])
        s3.upload(self.localSavePath, self.awsPath, fname_folds)
        accuracy_url = s3.generate_downloads(self.awsPath, fname_folds)
        
        # pickle the Pipeline for future use
        fname_pickle = 'classification_pipeline.pickle'
        with open(self.localSavePath + fname_pickle,'wb') as f:
            pickle.dump(text_clf,f)
        s3.upload(self.localSavePath, self.awsPath, fname_pickle)
        pickle_url = s3.generate_downloads(self.awsPath, fname_pickle)

        # plotting the roc curve
        self.labels = text_clf.classes_       
        y = label_binarize(self.target,classes = self.labels)

        
        # binary class
        if len(self.labels) <= 2:
            if model == 'Perceptron' or model == 'SGD' or model == 'passiveAggressive':
                fpr, tpr, _ = roc_curve(y[:, 0], y_score)
            else:
                y = []
                for label in self.target:
                    item = []
                    for i in range(len(text_clf.classes_)):
                        if label == text_clf.classes_[i]:
                            item.append(1)
                        else:
                            item.append(0)
                    y.append(item)
                y = np.array(y)
                fpr, tpr, _ = roc_curve(y.ravel(), y_score.ravel())
            
            roc_auc = auc(fpr, tpr)
            trace = go.Scatter(
                x = fpr,
                y = tpr,
                name = 'ROC curve (area =' + str(roc_auc) + ' )',
                line = dict(color=('deeppink'), width = 4)
            )
            data = [trace]

        # multiclasses  
        else:
            fpr = {}
            tpr = {}
            roc_auc = {}
            for i in range(len(self.labels)):
                fpr[self.labels[i]], tpr[self.labels[i]], _ = roc_curve(y[:, i], y_score[:, i])
                roc_auc[self.labels[i]] = auc(fpr[self.labels[i]], tpr[self.labels[i]])
            
            # Compute micro-average ROC curve and ROC area
            fpr["micro"], tpr["micro"], _ = roc_curve(y.ravel(), y_score.ravel())
            roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

            # First aggregate all false positive rates
            all_fpr = np.unique(np.concatenate([fpr[self.labels[i]] for i in range(len(self.labels))]))

            # Then interpolate all ROC curves at this points
            mean_tpr = np.zeros_like(all_fpr)
            for i in range(len(self.labels)):
                mean_tpr += interp(all_fpr, fpr[self.labels[i]], tpr[self.labels[i]])

            # Finally average it and compute AUC
            mean_tpr /= len(self.labels)

            fpr["macro"] = all_fpr
            tpr["macro"] = mean_tpr
            roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

            # plotting
            trace0 = go.Scatter(
                x = fpr['micro'],
                y = tpr['micro'],
                name = 'micro-average ROC curve (area =' + str(roc_auc["micro"]) + ' )',
                line = dict(color=('deeppink'), width = 4)
            )
            trace1 = go.Scatter(
                x = fpr['macro'],
                y = tpr['macro'],
                 name = 'macro-average ROC curve (area =' + str(roc_auc["macro"]) + ' )',
                line = dict(
                    color = ('navy'),
                    width = 4,)
            )
            data = [trace0, trace1]
            colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
            for i, color in zip(range(len(self.labels)), colors):
                trace = go.Scatter(
                    x = fpr[self.labels[i]], 
                    y = tpr[self.labels[i]],
                    name = 'ROC curve of class {0} (area = {1:0.2f})'.format(self.labels[i], roc_auc[self.labels[i]]),
                    line = dict(
                        color = (color),
                        width = 4, 
                        dash = 'dash')
                )
                data.append(trace)

                
        layout = dict(title = model + ' model ROC curve',
              xaxis = dict(title = 'False Positive Rate'),
              yaxis = dict(title = 'True Positive Rate'),
              )

        fig = dict(data=data, layout=layout)
        div = plot(fig, output_type='div',image='png',auto_open=False, image_filename='plot_img')
        
        # print the graph file
        fname_div ='div.html'
        with open(self.localSavePath + fname_div,'w') as f:
            f.write(div)
        s3.upload(self.localSavePath, self.awsPath, fname_div)
        div_url = s3.generate_downloads(self.awsPath, fname_div)

        return {'accuracy':accuracy_url, 'pickle':pickle_url, 'div':div_url }
Code example #18
                         client_secret="***REMOVED***")

    # loop through the id and store their comments
    for url, id in zip(urls, ids):
        url = "https://www.reddit.com" + url
        try:
            submission = reddit.submission(url=url)
            if not bfs(submission, id, comments_folder):
                # out of disk space: zip whatever has been collected so far and bail out
                zipf = zipfile.ZipFile(temp_dir + fname_zip, 'w',
                                       zipfile.ZIP_DEFLATED)
                zipdir(comments_folder + '/', zipf)
                zipf.close()

                # upload this zip to the s3 corresponding folder
                s3.upload(temp_dir, args.remoteReadPath, fname_zip)
                url = s3.generate_downloads(args.remoteReadPath, fname_zip)
                # delete the files
                d.deletedir('/tmp')
                # send out email notification
                n.notification(args.email,
                               case=1,
                               filename=args.remoteReadPath,
                               links=url,
                               sessionURL=args.sessionURL)
                exit(code='Lack of disk space')
        except:
            # skip submissions that cannot be resolved from the url
            pass

    # success and send email notification
    if not os.path.exists(localSavePath):
        os.makedirs(localSavePath)
    if not os.path.exists(localReadPath):
        os.makedirs(localReadPath)

    fname_config = 'config.json'
    if s3.checkExist(awsPath, fname_config):
        s3.downloadToDisk(fname_config, localSavePath, awsPath)
        with open(localSavePath + fname_config, "r") as fp:
            data = json.load(fp)
            for key in vars(args).keys():
                if key not in data.keys():
                    data[key] = vars(args)[key]
        with open(localSavePath + fname_config,"w") as f:
            json.dump(data,f)
        s3.upload(localSavePath, awsPath, fname_config)
        output['config'] = s3.generate_downloads(awsPath, fname_config)
        output['uuid'] = uid

    else:
        raise ValueError('This session ID is invalid!')

    # download the labeled data from s3 to tmp
    classification = Classification(awsPath, localSavePath, localReadPath,
                                    args.remoteReadPath, args.labeledFilename)

    output.update(classification.classify(args.model))
    output.update(classification.metrics())

    d.deletedir('/tmp')
Code example #20
    parser = argparse.ArgumentParser(description="processing...")
    parser.add_argument('--remoteReadPath', required=True)
    parser.add_argument('--ratio', required=True)
    parser.add_argument('--s3FolderName', required=True)
    parser.add_argument('--email', required=True)
    args = parser.parse_args()

    # arranging the paths
    uid = str(uuid.uuid4())
    awsPath = args.s3FolderName + '/ML/classification/' + uid + '/'
    localSavePath = '/tmp/' + args.s3FolderName + '/ML/classification/' + uid + '/'
    localReadPath = '/tmp/' + args.s3FolderName + '/' + uid + '/'
    if not os.path.exists(localSavePath):
        os.makedirs(localSavePath)
    if not os.path.exists(localReadPath):
        os.makedirs(localReadPath)

    fname = 'config.json'
    with open(localSavePath + fname, "w") as f:
        json.dump(vars(args), f)
    s3.upload(localSavePath, awsPath, fname)
    output['config'] = s3.generate_downloads(awsPath, fname)
    output['uuid'] = uid

    classification = Classification(awsPath, localSavePath, localReadPath,
                                    args.remoteReadPath)
    output.update(classification.split(int(args.ratio)))

    d.deletedir('/tmp')
    n.notification(args.email, case=3, filename=awsPath)
Code example #21
    def split(self, ratio):
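        # randomly sample ratio percent of the corpus as the training set;
        # everything left over becomes the unlabeled (testing) set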
        training_set = list(
            random.sample(self.corpus, int(len(self.corpus) * ratio / 100)))
        testing_set = [
            item for item in self.corpus if item not in training_set
        ]

        # plot a pie chart of the split
        labels = ['training set data points', 'unlabeled data points']
        values = [len(training_set), len(testing_set)]
        trace = go.Pie(labels=labels, values=values, textinfo='value')
        div_split = plot([trace],
                         output_type='div',
                         image='png',
                         auto_open=False,
                         image_filename='plot_img')
        fname_div_split = 'div_split.html'
        with open(self.localSavePath + fname_div_split, "w") as f:
            f.write(div_split)
        s3.upload(self.localSavePath, self.awsPath, fname_div_split)
        div_url = s3.generate_downloads(self.awsPath, fname_div_split)

        fname1 = 'TRAINING_' + self.filename
        try:
            with open(self.localSavePath + fname1,
                      'w',
                      newline="",
                      encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow(['text', 'category'])
                for row in training_set:
                    try:
                        writer.writerow([row])
                    except UnicodeEncodeError:
                        pass
        except:
            with open(self.localSavePath + fname1,
                      'w',
                      newline="",
                      encoding='ISO-8859-1') as f:
                writer = csv.writer(f)
                writer.writerow(['text', 'category'])
                for row in training_set:
                    try:
                        writer.writerow([row])
                    except UnicodeEncodeError:
                        pass
        s3.upload(self.localSavePath, self.awsPath, fname1)
        training_url = s3.generate_downloads(self.awsPath, fname1)

        fname2 = 'UNLABELED_' + self.filename
        try:
            with open(self.localSavePath + fname2,
                      'w',
                      newline="",
                      encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow(['text'])
                for row in testing_set:
                    try:
                        writer.writerow([row])
                    except UnicodeEncodeError:
                        pass
        except:
            with open(self.localSavePath + fname2,
                      'w',
                      newline="",
                      encoding='ISO-8859-1') as f:
                writer = csv.writer(f)
                writer.writerow(['text'])
                for row in testing_set:
                    try:
                        writer.writerow([row])
                    except UnicodeEncodeError:
                        pass
        s3.upload(self.localSavePath, self.awsPath, fname2)
        unlabeled_url = s3.generate_downloads(self.awsPath, fname2)

        return {
            'div': div_url,
            'training': training_url,
            'testing': unlabeled_url
        }
Code example #22
def extract_frequent_phrases(df, hashtag, date_marker, localPath):
    # filter df by hashtag
    new_df = df[df['Contents'].str.contains("#" + hashtag, na=False)]

    most_common = FreqDist(
        tokenize_no_stop(big_string(new_df['Contents'].values)))
    most_common_bigrams = FreqDist(
        ngram(tokenize_no_stop(big_string(new_df['Contents'].values)), 2))
    most_common_trigrams = FreqDist(
        ngram(tokenize_no_stop(big_string(new_df['Contents'].values)), 3))

    indices = []
    counts = []

    for phrases in [
            most_common.most_common(10),
            most_common_bigrams.most_common(10),
            most_common_trigrams.most_common(10)
    ]:
        index = []
        count = []
        for item in phrases:
            if isinstance(item[0], tuple):
                index.append(' '.join(item[0]))
            else:
                index.append(item[0])
            count.append(item[1])

        indices.append(index)
        counts.append(count)

    # upload to s3
    with open(
            os.path.join(
                localPath,
                hashtag + "_" + date_marker + "_extracted_frequent_words.csv"),
            "w") as f:
        writer = csv.writer(f)
        writer.writerow(['word', 'count'])
        for row in most_common.most_common():
            writer.writerow(row)
    s3.upload("macroscope-paho-covid", localPath, "frequent_phrases",
              hashtag + "_" + date_marker + "_extracted_frequent_words.csv")

    with open(
            os.path.join(
                localPath, hashtag + "_" + date_marker +
                "_extracted_frequent_bigrams.csv"), "w") as f:
        writer = csv.writer(f)
        writer.writerow(['bigram', 'count'])
        for row in most_common_bigrams.most_common():
            writer.writerow(row)
    s3.upload("macroscope-paho-covid", localPath, "frequent_phrases",
              hashtag + "_" + date_marker + "_extracted_frequent_bigrams.csv")

    with open(
            os.path.join(
                localPath, hashtag + "_" + date_marker +
                "_extracted_frequent_trigrams.csv"), "w") as f:
        writer = csv.writer(f)
        writer.writerow(['trigram', 'count'])
        for row in most_common_trigrams.most_common():
            writer.writerow(row)
    s3.upload("macroscope-paho-covid", localPath, "frequent_phrases",
              hashtag + "_" + date_marker + "_extracted_frequent_trigrams.csv")

    return indices, counts
Code example #23
            comment_queue = submission.comments[:]  # Seed with top-level
            comments_no_order = [[
                'author', 'body', 'created_utc', 'id', 'link_id', 'parent_id',
                'score', 'subreddit_display_name', 'subreddit_name_prefixed',
                'subreddit_id'
            ]]
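            # breadth-first traversal of the comment tree: pop a comment, record its
            # fields, then enqueue its replies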

            while comment_queue:
                comment = comment_queue.pop(0)
                comments_no_order.append([
                    str(comment.author), comment.body, comment.created_utc,
                    comment.id, comment.link_id, comment.parent_id,
                    comment.score, comment.subreddit.display_name,
                    comment.subreddit_name_prefixed, comment.subreddit_id
                ])
                comment_queue.extend(comment.replies)

            # if the folder doesn't exist, create it

            # save to csv
            with open(id + '.csv', 'w', newline="", encoding='utf-8') as f:
                writer = csv.writer(f, delimiter=',')
                writer.writerows(comments_no_order)

            # push to s3 bucket
            s3.upload('', 'Comment/' + sub + '/' + folder_id + '/',
                      id + '.csv')

            # delete local file
            remove(id + '.csv')
Code example #24
def lambda_handler(event, context):
    # create local path
    localPath = os.path.join('/tmp', 'frequent_phrases')
    if not os.path.exists(localPath):
        os.makedirs(localPath)

    # download triggered file
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = unquote_plus(event['Records'][0]['s3']['object']['key'])
    remotePath = "/".join(key.split("/")[:-1])

    files = s3.listFiles(bucket, remotePath)
    sorted_files = sorted(files,
                          key=lambda file: file['LastModified'],
                          reverse=True)

    hashtags = ["COVID19", "coronavirus", "COVID_19"]
    date_markers = ["1day", "7days", "30days"]

    for hashtag in hashtags:
        indices_row = []
        counts_row = []
        legends_row = []
        for date_marker in date_markers:
            if date_marker == "1day":
                today = key.split("/")[-1]
                s3.downloadToDisk(bucket, today, localPath, remotePath)
                df_today = pd.read_csv(os.path.join(localPath, today))
                indices, counts = extract_frequent_phrases(
                    df_today, hashtag, date_marker, localPath)
                legends = ["word (1day)", "bigram (1day)", "trigram(1day)"]
            elif date_marker == "7days":
                last_7_days_files = sorted_files[:7]
                last_7_days_list = []
                for file in last_7_days_files:
                    fname = file['Key'].split("/")[-1]
                    s3.downloadToDisk(bucket, fname, localPath, remotePath)
                    last_7_days_list.append(
                        pd.read_csv(os.path.join(localPath, fname)))
                last_7_days_df = pd.concat(last_7_days_list,
                                           axis=0,
                                           ignore_index=True)
                indices, counts = extract_frequent_phrases(
                    last_7_days_df, hashtag, date_marker, localPath)
                legends = ["word (7days)", "bigram (7days)", "trigram(7days)"]
            elif date_marker == "30days":
                last_30_days_files = sorted_files[:30]
                last_30_days_list = []
                for file in last_30_days_files:
                    fname = file['Key'].split("/")[-1]
                    s3.downloadToDisk(bucket, fname, localPath, remotePath)
                    last_30_days_list.append(
                        pd.read_csv(os.path.join(localPath, fname)))
                last_30_days_list = pd.concat(last_30_days_list,
                                              axis=0,
                                              ignore_index=True)
                indices, counts = extract_frequent_phrases(
                    last_30_days_list, hashtag, date_marker, localPath)
                legends = [
                    "word (30days)", "bigram (30days)", "trigram(30days)"
                ]
            else:
                break

            indices_row.append(indices)
            counts_row.append(counts)
            legends_row.append(legends)

        # Plot and save
        title = "Most prevalent 10 frequent words and phrases used in #" + hashtag + " tweets"
        div = plot.plot_multiple_bar_chart(indices_row, counts_row, title,
                                           legends_row)
        with open(
                os.path.join(localPath,
                             hashtag + "_extracted_frequent_phrases.html"),
                'w') as f:
            f.write(div)
        s3.upload("macroscope-paho-covid", localPath, "frequent_phrases",
                  hashtag + "_extracted_frequent_phrases.html")

    return None
Code example #25
def interest_by_region(keywords, language, localPath):
    country_code = pd.read_csv("tableconvert_csv_j8hnfj.csv", quotechar="\"")
    if language.lower() == 'spanish':
        pytrend = TrendReq(hl='sp-SP')
    else:
        pytrend = TrendReq()

    today = date.today()
    march = "2020-03-01"

    # Google Trends limits the combined length of the keywords in one request,
    # so batch them (kept conservatively under 50 characters here) and loop until all are sent
    while len(keywords) > 0:
        character_len = 0
        keywords_split = []
        for kk in keywords:
            character_len += len(kk)
            if character_len < 50:
                keywords_split.append(kk)
        for item in keywords_split:
            keywords.remove(item)

        pytrend.build_payload(kw_list=keywords_split,
                              timeframe=march + " " +
                              today.strftime("%Y-%m-%d"))
        df_regions = pytrend.interest_by_region(inc_geo_code=True)
        df_regions['country'] = df_regions.index
        df_regions = pd.merge(df_regions,
                              country_code,
                              left_on="geoCode",
                              right_on="Alpha-2 code",
                              how="left")

        for keyword in keywords_split:
            geo_name_df = df_regions[[
                'country', keyword, 'Alpha-2 code', 'Alpha-3 code',
                'Numeric code', 'Latitude (average)', 'Longitude (average)'
            ]]

            if geo_name_df is not None:
                title = "Google Trends Interest by Region related to keyword: " + keyword + " (Since March 2020)"
                div = plot.plot_geograph(geo_name_df, keyword, title)
                with open(
                        os.path.join(
                            localPath,
                            keyword.replace(" ", "_") +
                            "_interest_by_region.html"), 'w') as f:
                    f.write(div)
                s3.upload(
                    "macroscope-paho-covid", localPath, "interest_by_region",
                    keyword.replace(" ", "_") + "_interest_by_region.html")

                # save csv
                geo_name_df.to_csv(os.path.join(
                    localPath,
                    keyword.replace(" ", "_") + "_interest_by_region.csv"),
                                   index=False)
                s3.upload(
                    "macroscope-paho-covid", localPath, "interest_by_region",
                    keyword.replace(" ", "_") + "_interest_by_region.csv")

    return None