def __init__(self, access_token, account_id, alert_id, mention_id,
             limit=None, before_date=None):
    """
    Parameters
    ----------
    access_token: string
        Mention API access_token
    account_id: string
        ID of the account.
    alert_id: string
        ID of the alert.
    mention_id: string
        ID of the mention.
    limit: string
        Number of mentions to return. Max 1000.
    before_date: string
        Return mentions before this date, in 'yyyy-MM-dd HH:mm' format,
        e.g. '2018-11-25 12:00'
    """
    self.access_token = access_token
    self.account_id = account_id
    self.alert_id = alert_id
    self.mention_id = mention_id
    self.limit = limit
    if before_date is not None:
        self.before_date = utils.transform_date(before_date)
    else:
        self.before_date = before_date
    super(FetchMentionChildrenAPI, self).__init__(access_token)
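# A hedged usage sketch for the constructor above; the class name comes
# from the super() call, and the token and IDs below are placeholders,
# not values from the source.
api = FetchMentionChildrenAPI(
    access_token='YOUR_ACCESS_TOKEN',
    account_id='account-id',
    alert_id='alert-id',
    mention_id='mention-id',
    limit='20',
    before_date='2018-11-25 12:00',  # converted via utils.transform_date
)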
def get_topics(self, list_url):
    nb_page = len(list_url)
    for num_page, url in enumerate(list_url, 1):
        obj_page = urlopen(url)
        soup = BeautifulSoup.BeautifulSoup(obj_page)
        name_zone = soup.findAll("div", {"id": "vf"})[0].h2.span.string
        search_category = False
        # "Résultats de la recherche" is the forum's search-results page
        # title; on search pages the category comes from each row instead.
        if name_zone == u"Résultats de la recherche":
            search_category = True
        else:
            category = name_zone
            id_category = url.split("id=")[-1].split("&")[0]
        # Progress bar ("Obtention des pages" = "Fetching pages").
        sys.stdout.write(
            "\rObtention des pages ▕" + "█" * num_page
            + " " * (nb_page - num_page) + "▏ "
            + str(num_page) + "/" + str(nb_page)
        )
        sys.stdout.flush()
        for item in soup.findAll("div", "tclcon"):
            # "Déplacé" flags a topic that was moved to another category.
            is_move = False
            if item.contents[0] and u"Déplacé" in item.contents[0].strip():
                is_move = True
            tr_parent = item.findParents("tr")[0]
            topic_id = item.a["href"].split("id=")[-1]
            titre = htmlentitydecode(item.a.string)
            auteur = item.span.contents[0].replace("par ", "")
            is_closed = tr_parent.get("class") == "iclosed"
            if not is_move:
                balise_td = tr_parent.findAll("td", "tcr")[0]
                date = balise_td.a.string
                obj_date = transform_date(date)
            else:
                obj_date = None
            if search_category:
                td_category = tr_parent.findAll("td", "tc2")[0]
                category = td_category.a.string
                id_category = td_category.a["href"].split("id=")[-1]
            yield {
                "id": topic_id,
                "auteur": auteur,
                "titre": titre,
                "is_closed": is_closed,
                "date_last": obj_date,
                "is_move": is_move,
                "id_category": id_category,
                "category": category,
                "num_page": num_page,
            }
    print("")
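# A hedged usage sketch; the scraper instance and forum URLs are
# placeholders shaped like the 'id=' query strings the parser expects.
urls = [
    "http://forum.example.org/viewforum.php?id=3",
    "http://forum.example.org/viewforum.php?id=3&p=2",
]
for topic in scraper.get_topics(urls):
    print("%s - %s" % (topic["id"], topic["titre"]))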
import pandas as pd

from utils import transform_date, add_hours

dfs = []

# Île-de-France hourly PM10: pivot to one column per station.
df_idf_horaire_pm10 = pd.read_csv(
    "csv/idf_horaire_pm10.csv", delimiter=';'
)[['nom_station', 'valeur', 'date_debut']]
df_idf_horaire_pm10 = df_idf_horaire_pm10.pivot_table(
    index='date_debut', columns='nom_station', values='valeur')
df_idf_horaire_pm10.reset_index(inplace=True)
df_idf_horaire_pm10['date_debut'] = df_idf_horaire_pm10['date_debut'].apply(
    lambda x: transform_date(x, 1))
dfs.append(df_idf_horaire_pm10)

# Auvergne-Rhône-Alpes hourly PM10: same pivot.
df_aura_horaire_pm10 = pd.read_csv(
    "csv/aura_horaire_pm10.csv", delimiter=';'
)[['nom_station', 'valeur', 'date_debut']]
df_aura_horaire_pm10 = df_aura_horaire_pm10.pivot_table(
    index='date_debut', columns='nom_station', values='valeur')
df_aura_horaire_pm10.reset_index(inplace=True)
df_aura_horaire_pm10['date_debut'] = df_aura_horaire_pm10['date_debut'].apply(
    lambda x: transform_date(x, 1))
dfs.append(df_aura_horaire_pm10)

# Bourgogne-Franche-Comté hourly PM10: already one column per station.
df_bfc_horaire_pm10 = pd.read_csv("csv/bfc_horaire_pm10.csv", delimiter=';')
# 'Non disponible' is this source's sentinel for missing readings.
df_bfc_horaire_pm10 = df_bfc_horaire_pm10.replace('Non disponible', 'NaN')
df_bfc_horaire_pm10.rename(columns={'Date': 'date_debut'}, inplace=True)
dfs.append(df_bfc_horaire_pm10)
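# The script above only collects the per-region frames into dfs; a hedged
# sketch of one plausible next step (an assumption, not from the source):
# outer-join everything on the shared 'date_debut' key so each station
# ends up as a column of a single hourly table.
from functools import reduce

merged = reduce(
    lambda left, right: pd.merge(left, right, on='date_debut', how='outer'),
    dfs,
)
merged = merged.sort_values('date_debut')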
import unittest

import utils


class TestTransformDate(unittest.TestCase):

    def test_date(self):
        # transform_date URL-encodes the ISO-8601 form of the input:
        # ':' becomes '%3A' and '+' becomes '%2B'.
        result = utils.transform_date('2018-11-25 12:00')
        self.assertEqual(result, '2018-11-25T12%3A00%3A00.12345%2B00%3A00')
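# The test above pins down the expected output of utils.transform_date: an
# URL-encoded ISO-8601 timestamp. A minimal sketch that satisfies the
# assertion, assuming the fixed '.12345+00:00' fraction/offset suffix seen
# in the expected value; the real utils.transform_date may build it
# differently.
from urllib.parse import quote

def transform_date(date_str):
    # 'yyyy-MM-dd HH:mm' -> 'yyyy-MM-ddTHH:mm:00.12345+00:00', URL-encoded.
    iso = date_str.replace(' ', 'T') + ':00.12345+00:00'
    return quote(iso, safe='')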
def __init__(self,
             access_token,
             account_id,
             alert_id,
             since_id=None,
             limit='20',
             before_date=None,      # e.g. 2018-07-07T00:00:00.12345+02:00
             not_before_date=None,  # e.g. 2018-07-01T00:00:00.12345+02:00
             source=None,
             unread=None,
             favorite=None,
             folder=None,
             tone=None,
             countries=None,
             include_children=None,
             sort=None,
             languages=None,
             timezone=None,
             q=None,
             cursor=None):
    """
    Parameters
    ----------
    access_token: string
        Mention API access_token
    account_id: string
        ID of the account.
    alert_id: string
        ID of the alert.
    since_id: string
        Returns mentions ordered by id.
        Cannot be combined with before_date, not_before_date, or cursor.
    limit: string
        Number of mentions to return. Max 1000.
    before_date: string
        Return mentions before this date, in 'yyyy-MM-dd HH:mm' format,
        e.g. '2018-11-25 12:00'
    not_before_date: string
        Return mentions not before this date, in 'yyyy-MM-dd HH:mm'
        format, e.g. '2018-10-04 12:00'
    source: string
        Must be one of web, twitter, blogs, forums, news, facebook,
        images or videos.
    unread: boolean
        Return only unread mentions.
        Must not be combined with favorite, q, or tone.
    favorite: boolean
        Whether to return only favorite mentions. Cannot be combined
        with folder when folder is not inbox or archive.
    folder: string
        Filter by folder. Can be: inbox, archive, spam, trash.
        With spam and trash, include_children is enabled by default.
    tone: string
        Filter by tone. Must be one of 'negative', 'neutral', 'positive'.
    countries: string
        Filter by country.
    include_children: boolean
        Include children mentions.
    sort: string
        Sort results. Must be one of published_at,
        author_influence.score, direct_reach, cumulative_reach,
        domain_reach.
    languages: string
        Filter by language.
    timezone: string
        Filter by timezone.
    q: string
        Filter by search query.
    cursor: string
        Filter by pagination cursor.
    """
    self.access_token = access_token
    self.account_id = account_id
    self.alert_id = alert_id
    self.limit = limit
    self.since_id = since_id
    if before_date is not None:
        self.before_date = utils.transform_date(before_date)
    else:
        self.before_date = before_date
    if not_before_date is not None:
        self.not_before_date = utils.transform_date(not_before_date)
    else:
        self.not_before_date = not_before_date
    self.source = source
    if unread is not None:
        self.unread = utils.transform_boolean(unread)
    else:
        self.unread = unread
    if favorite is not None:
        self.favorite = utils.transform_boolean(favorite)
    else:
        self.favorite = favorite
    self.folder = folder
    if tone is not None:
        self.tone = utils.transform_tone(tone)
    else:
        self.tone = tone
    self.countries = countries
    if include_children is not None:
        self.include_children = utils.transform_boolean(include_children)
    else:
        self.include_children = include_children
    self.sort = sort
    self.languages = languages
    self.timezone = timezone
    self.q = q
    self.cursor = cursor
    super(FetchAllMentionsAPI, self).__init__(access_token)
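# A hedged usage sketch for the constructor above; the token and IDs are
# placeholders, not values from the source.
api = FetchAllMentionsAPI(
    access_token='YOUR_ACCESS_TOKEN',
    account_id='account-id',
    alert_id='alert-id',
    limit='50',
    before_date='2018-11-25 12:00',  # converted via utils.transform_date
    unread=True,                     # converted via utils.transform_boolean
    tone='positive',                 # converted via utils.transform_tone
    sort='published_at',
)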
import json
import os

import pandas as pd

# clean_text, transform_date, get_hashtags_operations, check_attack, and
# transform_user_rt_to_tweet are helpers defined elsewhere in the project.

with open('../artifacts/anon_dict.json') as json_file:
    json_file_item = json.load(json_file)

# Load every results_*.csv in the directory into a single dataframe.
data_files = []
analysis = ""
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith('.csv') and filename.startswith('results_'):
        data_files.append(pd.read_csv(os.path.join(directory, filename)))
        analysis = filename.split('.csv')[0]
df = pd.concat(data_files, sort=False)

df['clean_text'] = df['text'].map(lambda x: clean_text(x))
df['date'] = df['timestamp'].apply(lambda x: transform_date(x))
df['year'] = df['date'].apply(lambda x: x.year)
# Keep only the most recent year of tweets.
df = df.loc[df['year'] >= df['year'].max()]

df['hashtags'] = df['text'].map(lambda x: get_hashtags_operations(x))
terms_attacks = json_file_item["attacks"]
df['attack'] = df['clean_text'].map(lambda x: check_attack(x, terms_attacks))
# A tweet belongs to an operation when any of its hashtags starts with '#op'.
df['operations'] = df['hashtags'].map(
    lambda x: any(hashtag[:3] == '#op' for hashtag in x))
# Substring check: flags any cleaned text containing 'rt'.
df['RT'] = df['clean_text'].map(lambda x: 'rt' in x)
# Translate RTs back to the original author; non-RT rows are left as NaN.
df['user'] = df[df['RT'] == True]['text'].apply(
    lambda x: transform_user_rt_to_tweet(x))
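# A guess at what check_attack does, judging from how it is called with the
# cleaned text and the JSON 'attacks' term list; the real helper may differ.
def check_attack(text, terms_attacks):
    # Flag a tweet when any known attack term appears in its cleaned text.
    return any(term in text for term in terms_attacks)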