Example #1
    def __init__(self, access_token, account_id, alert_id, mention_id,
                 limit=None, before_date=None):
        """
        Parameters
        ----------
        access_token: string
            Mention API access_token

        account_id: string
            ID of the account.

        alert_id: string
            ID of the alert.

        mention_id: string
            ID of the mention.

        limit: string
            Number of mentions to return; max 1000.

        before_date: string
            Return mentions before this date, in 'yyyy-MM-dd HH:mm' format,
            e.g. '2018-11-25 12:00'
        """
        self.access_token = access_token
        self.account_id = account_id
        self.alert_id = alert_id
        self.mention_id = mention_id
        self.limit = limit
        
        if before_date is not None:
            self.before_date = utils.transform_date(before_date)
        else:
            self.before_date = before_date
        super(FetchMentionChildrenAPI, self).__init__(access_token)
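None of these snippets include `utils.transform_date` itself. A minimal sketch of what the Mention API variant might look like, inferred from the docstring above and the expected value in the unit test of Example #4 below; the fixed '.12345' fraction and the '+00:00' offset are assumptions taken from that test, not confirmed behaviour:

import datetime
import urllib.parse


def transform_date(date_str):
    # Sketch only: turn 'yyyy-MM-dd HH:mm' into a URL-encoded ISO-8601
    # timestamp such as '2018-11-25T12%3A00%3A00.12345%2B00%3A00'.
    # The fraction and offset are assumptions inferred from Example #4.
    parsed = datetime.datetime.strptime(date_str, '%Y-%m-%d %H:%M')
    iso = parsed.strftime('%Y-%m-%dT%H:%M:%S') + '.12345+00:00'
    return urllib.parse.quote(iso, safe='')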
Example #2
    def get_topics(self, list_url):
        nb_page = len(list_url)
        for num_page, url in enumerate(list_url):
            num_page += 1
            obj_page = urlopen(url)
            soup = BeautifulSoup.BeautifulSoup(obj_page)
            name_zone = soup.findAll("div", {"id": "vf"})[0].h2.span.string
            search_category = False
            if name_zone == u"Résultats de la recherche":
                search_category = True
            else:
                category = name_zone
                id_category = url.split("id=")[-1].split("&")[0]
            sys.stdout.write(
                "\rObtention des pages ▕"
                + "█" * num_page
                + " " * (nb_page - num_page)
                + "▏ "
                + str(num_page)
                + "/"
                + str(nb_page)
            )
            sys.stdout.flush()

            for item in soup.findAll("div", "tclcon"):
                is_move = False
                if item.contents[0] and u"Déplacé" in item.contents[0].strip():
                    is_move = True
                tr_parent = item.findParents("tr")[0]

                topic_id = item.a["href"].split("id=")[-1]
                titre = htmlentitydecode(item.a.string)
                auteur = item.span.contents[0].replace("par ", "")

                # BS3 returns the class attribute as a plain string
                is_closed = tr_parent.get("class") == "iclosed"
                if not is_move:
                    balise_td = tr_parent.findAll("td", "tcr")[0]
                    date = balise_td.a.string
                    obj_date = transform_date(date)
                else:
                    obj_date = None
                if search_category:
                    td_category = tr_parent.findAll("td", "tc2")[0]
                    category = td_category.a.string
                    id_category = td_category.a["href"].split("id=")[-1]

                yield {
                    "id": topic_id,
                    "auteur": auteur,
                    "titre": titre,
                    "is_closed": is_closed,
                    "date_last": obj_date,
                    "is_move": is_move,
                    "id_category": id_category,
                    "category": category,
                    "num_page": num_page,
                }
        print ("")
Example #3
import pandas as pd
from utils import transform_date, add_hours

dfs = []

df_idf_horaire_pm10 = pd.read_csv("csv/idf_horaire_pm10.csv", delimiter=';')[[
    'nom_station', 'valeur', 'date_debut'
]]
df_idf_horaire_pm10 = df_idf_horaire_pm10.pivot_table(index='date_debut',
                                                      columns='nom_station',
                                                      values='valeur')
df_idf_horaire_pm10.reset_index(inplace=True)
df_idf_horaire_pm10['date_debut'] = df_idf_horaire_pm10['date_debut'].apply(
    lambda x: transform_date(x, 1))
dfs.append(df_idf_horaire_pm10)

df_aura_horaire_pm10 = pd.read_csv(
    "csv/aura_horaire_pm10.csv",
    delimiter=';')[['nom_station', 'valeur', 'date_debut']]
df_aura_horaire_pm10 = df_aura_horaire_pm10.pivot_table(index='date_debut',
                                                        columns='nom_station',
                                                        values='valeur')
df_aura_horaire_pm10.reset_index(inplace=True)
df_aura_horaire_pm10['date_debut'] = df_aura_horaire_pm10['date_debut'].apply(
    lambda x: transform_date(x, 1))
dfs.append(df_aura_horaire_pm10)

df_bfc_horaire_pm10 = pd.read_csv("csv/bfc_horaire_pm10.csv", delimiter=';')
df_bfc_horaire_pm10 = df_bfc_horaire_pm10.replace('Non disponible', float('nan'))
df_bfc_horaire_pm10.rename(columns={'Date': 'date_debut'}, inplace=True)
dfs.append(df_bfc_horaire_pm10)
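The snippet stops after filling `dfs`; the combining step is not shown. One plausible follow-up, assuming the goal is a single hourly PM10 frame keyed on `date_debut` (an assumption, since the merge is not in the source):

from functools import reduce

# Assumption: outer-merge the regional frames on their shared date column.
df_pm10 = reduce(
    lambda left, right: pd.merge(left, right, on='date_debut', how='outer'),
    dfs)
df_pm10 = df_pm10.sort_values('date_debut').reset_index(drop=True)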
Example #4
import unittest

import utils


class TestTransformDate(unittest.TestCase):

    result = utils.transform_date('2018-11-25 12:00')

    def test_date(self):
        self.assertEqual(
            self.result, '2018-11-25T12%3A00%3A00.12345%2B00%3A00')
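With the imports in place, the standard unittest entry point runs the case directly; `python -m unittest` from the test directory works as well:

if __name__ == '__main__':
    unittest.main()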
Example #5
    def __init__(
            self,
            access_token,
            account_id,
            alert_id,
            since_id=None,
            limit='20',
            before_date=None,  # 2018-07-07T00:00:00.12345+02:00
            not_before_date=None,  # 2018-07-01T00:00:00.12345+02:00
            source=None,
            unread=None,
            favorite=None,
            folder=None,
            tone=None,
            countries=None,
            include_children=None,
            sort=None,
            languages=None,
            timezone=None,
            q=None,
            cursor=None):
        """
        Parameters
        ----------
        access_token: string
            Mention API access_token

        account_id: string
            ID of the account.

        alert_id: string
            ID of the alert.

        since_id: string
            Returns mentions ordered by id.
            Cannot be combined with before_date, not_before_date, or cursor.

        limit: string
            Number of mentions to return; max 1000.

        before_date: string
            Return mentions before this date, in 'yyyy-MM-dd HH:mm' format,
            e.g. '2018-11-25 12:00'

        not_before_date: string
            Return mentions not before this date, in 'yyyy-MM-dd HH:mm'
            format, e.g. '2018-10-04 12:00'

        source: string
            Must be one of: web, twitter, blogs, forums, news, facebook,
            images, videos.

        unread: boolean
            Return only unread mentions.
            Must not be combined with favorite, q, or tone.

        favorite: boolean
            Whether to return only favorite mentions.
            Cannot be combined with folder unless folder is inbox or archive.

        folder: string
            Filter by folder. Can be: inbox, archive, spam, trash.
            With spam and trash, include_children is enabled by default.

        tone: string
            Filter by tone. Must be one of 'negative', 'neutral', 'positive'

        countries: string
            Filter by country

        include_children: boolean
            Whether to include children mentions.

        sort: string
            Sort results. Must be one of published_at, author_influence.score,
            direct_reach, cumulative_reach, domain_reach.

        languages: string
            Filter by language

        timezone: string
            Filter by timezone

        q: string
            Filter by search query.

        cursor: string
            Pagination cursor.
        """
        self.access_token = access_token
        self.account_id = account_id
        self.alert_id = alert_id

        self.limit = limit

        self.since_id = since_id

        if before_date is not None:
            self.before_date = utils.transform_date(before_date)
        else:
            self.before_date = before_date

        if not_before_date is not None:
            self.not_before_date = utils.transform_date(not_before_date)
        else:
            self.not_before_date = not_before_date

        self.source = source

        if unread is not None:
            self.unread = utils.transform_boolean(unread)
        else:
            self.unread = unread

        if favorite is not None:
            self.favorite = utils.transform_boolean(favorite)
        else:
            self.favorite = favorite

        self.folder = folder

        if tone is not None:
            self.tone = utils.transform_tone(tone)
        else:
            self.tone = tone

        self.countries = countries

        if include_children is not None:
            self.include_children = utils.transform_boolean(include_children)
        else:
            self.include_children = include_children

        self.sort = sort
        self.languages = languages
        self.timezone = timezone
        self.q = q
        self.cursor = cursor
        super(FetchAllMentionsAPI, self).__init__(access_token)
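`utils.transform_boolean` and `utils.transform_tone` are referenced here but never shown. Minimal sketches under two explicit assumptions: that the API wants lowercase string booleans in the query string, and that tone is encoded numerically:

def transform_boolean(value):
    # Assumption: the API expects the strings 'true'/'false',
    # not Python booleans.
    return 'true' if value else 'false'


def transform_tone(tone):
    # Assumption: the API encodes tone as -1/0/1.
    return {'negative': -1, 'neutral': 0, 'positive': 1}[tone]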
Example #6
import json
import os

import pandas as pd

json_file_item = None
with open('../artifacts/anon_dict.json') as json_file:
    json_file_item = json.load(json_file)

analysis = ""
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith('.csv') and filename.startswith('results_'):
        data_files.append(pd.read_csv(directory+filename))
        analysis = filename.split('.csv')[0]

df = pd.concat(data_files, sort=False)

df['clean_text'] = df['text'].map(clean_text)

df['date'] = df['timestamp'].apply(transform_date)
df['year'] = df['date'].apply(lambda x: x.year)
df = df.loc[df['year'] >= df['year'].max()]

df['hashtags'] = df['text'].map(get_hashtags_operations)

terms_attacks = json_file_item["attacks"]

df['attack'] = df['clean_text'].map(lambda x: check_attack(x, terms_attacks))
df['operations'] = df['hashtags'].map(
    lambda x: any(hashtag[:3] == '#op' for hashtag in x))
df['RT'] = df['clean_text'].map(lambda x: 'rt' in x)

# Translate RTs to Attacks
df['user'] = df[df['RT']]['text'].apply(transform_user_rt_to_tweet)
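`transform_user_rt_to_tweet` is applied only to rows flagged as retweets, so it presumably pulls the original author out of the 'RT @user:' prefix. A sketch under that assumption (the real helper is not in the listing):

import re


def transform_user_rt_to_tweet(text):
    # Assumption: retweet text starts with 'RT @<user>:'; return the user,
    # or None when the pattern is absent.
    match = re.match(r'RT @(\w+):', text)
    return match.group(1) if match else None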