Example #1
from pytrends.request import TrendReq
import pandas as pd


def ggl_trends(grouped, keyword):
    pytrends = TrendReq(hl='en-US', tz=360)
    kw_list = [keyword]
    pytrends.build_payload(kw_list, cat=0, timeframe='all', geo='US', gprop='')
    ggl_trends = pytrends.interest_over_time()
    if ggl_trends.empty:
        return pd.DataFrame()
    grouped_ggl_trends = ggl_trends.groupby(pd.Grouper(freq='1M')).mean().rename(columns={keyword: 'Google Trends'})
    return grouped.merge(grouped_ggl_trends, left_index=True, right_index=True, how='inner')
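A minimal usage sketch (not part of the original example), assuming grouped is a monthly-aggregated DataFrame indexed by month-end dates:

import pandas as pd

# hypothetical monthly price data; the month-end index matches the Grouper('1M') output above
prices = pd.DataFrame({'Close': [101.2, 99.8, 105.6]},
                      index=pd.date_range('2020-01-31', periods=3, freq='M'))
merged = ggl_trends(prices, 'bitcoin')
print(merged.head())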
Example #2
class GoogleTrendStatsEvaluator(StatsSocialEvaluator):
    def __init__(self):
        super().__init__()
        self.pytrends = None
        self.is_threaded = False

    # Use pytrends lib (https://github.com/GeneralMills/pytrends)
    # https://github.com/GeneralMills/pytrends/blob/master/examples/example.py
    def get_data(self):
        self.pytrends = TrendReq(hl='en-US', tz=0)
        # self.pytrends.GENERAL_URL = "https://trends.google.com/trends/explore"
        # self.symbol
        key_words = [self.symbol]
        try:
            # looks like only 1 and 3 months are working ...
            time_frame = "today " + str(self.social_config[STATS_EVALUATOR_HISTORY_TIME]) + "-m"
            # Warning: the requests-per-hour limit is apparently quite low
            self.pytrends.build_payload(kw_list=key_words, cat=0, timeframe=time_frame, geo='', gprop='')
        except ResponseError as e:
            self.logger.warn(str(e))

    def eval_impl(self):
        interest_over_time_df = self.pytrends.interest_over_time()

        # analyse recent changes in the interest-over-time series
        self.eval_note = AdvancedManager.get_class(self.config, StatisticAnalysis).analyse_recent_trend_changes(
            interest_over_time_df[self.symbol], numpy.sqrt)

    def run(self):
        pass

    # check if history is not too high
    def load_config(self):
        super(GoogleTrendStatsEvaluator, self).load_config()
        if self.social_config[STATS_EVALUATOR_HISTORY_TIME] > STATS_EVALUATOR_MAX_HISTORY_TIME:
            self.social_config[STATS_EVALUATOR_HISTORY_TIME] = STATS_EVALUATOR_MAX_HISTORY_TIME

    def set_default_config(self):
        self.social_config = {
            CONFIG_REFRESH_RATE: 3600,
            STATS_EVALUATOR_HISTORY_TIME: 3
        }
Example #3
def scrape():
    cur_dir = os.getcwd()
    if request.method == 'POST':
        MONGODB_HOST = 'localhost'
        MONGODB_PORT = 27017
        DBS_NAME = 'donorschoose'
        COLLECTION_NAME = 'projects'
        FIELDS = {}
        connection = MongoClient(MONGODB_HOST, MONGODB_PORT)
        collection = connection[DBS_NAME][COLLECTION_NAME]
        projects = collection.find(projection=FIELDS, limit=5000)

        #Reading in the classification model
        model = pickle.load(open(cur_dir + "/fakenews_model.dat", "rb"))

        #Reading in the URL input by the user
        entered_url = request.form['text']

        #Creating an article object
        article = newspaper.article.Article(url=entered_url)
        article.download()
        article.parse()
        article.nlp()

        #Extracting the article text
        text = article.text
        d = {'text': [text]}
        text_df = pd.DataFrame(data=d)
        externaltestinput = text_df

        #Begin Classification of Article
        #Clean the data and extract features
        externaltest = fe.clean_text_add_features(externaltestinput)
        #Vectorize the word
        externaltest = fe.word_vectors(externaltest)

        #Structuring the data
        externaldata = externaltest[[
            'specialchar', 'wordcount', 'avewordlength',
            'firstpersonwordcount', 'uniquewords', 'capitalizedwords', 'vector'
        ]]
        externaldata2 = pd.concat(
            [externaldata['vector'].apply(pd.Series), externaldata], axis=1)
        externaldata2.drop('vector', axis=1, inplace=True)
        externaldata2.dropna(inplace=True)
        externaldata_predictor = externaldata2.to_numpy()

        SC = externaldata2['specialchar'][0]
        CW = externaldata2['capitalizedwords'][0]
        AWL = externaldata2['avewordlength'][0]

        awl_data = [
            100 * np.abs(1 - float(5 / AWL)), 100 * np.abs(float(5 / AWL))
        ]
        cw_data = [100 * float(CW), 100 * (1 - float(CW))]
        sc_data = [(1000 * np.abs(SC)), 100 - (1000 * np.abs(SC))]

        #Running the model and storing the result
        xgb_externaltest_output = model.predict(externaldata_predictor)

        if xgb_externaltest_output[0]:
            classification = "FAKE"
        else:
            classification = "REAL"
        # getting related words
        tagged_art = pos_tag(article.text.split())
        all_nouns = [
            word for word, pos in tagged_art if (pos == 'NN' or pos == 'NNP')
        ]
        count = Counter(all_nouns)
        most_cmn = count.most_common()[0:5]

        noun_list = [str(noun) for noun, _ in most_cmn]

        all_combos = list(combinations(noun_list, 2))

        places = GeoText(article.text)
        all_cities = places.cities

        pytrend = TrendReq()
        #tf = 'now 7-d'
        tf = 'today 1-m'

        all_cities = list(set(all_cities))
        # checking if any combinations are cities

        prev_max = 0
        occur = dict(most_cmn)

        i = 0
        for item in all_combos:

            for city in all_cities:
                c1 = "" + item[0] + " " + item[1]
                c2 = "" + item[1] + " " + item[0]
                if (c1 in city or c2 in city):
                    # remove both items, add back city
                    # and higher occurence item
                    # if equal, remove both
                    noun_list.remove(item[0])
                    noun_list.remove(item[1])
                    city_name = str(city).replace(" City", "")

                    #if there are the same number of occurences
                    if (occur[item[0]] == occur[item[1]]):
                        noun_list.append(city_name)

                    elif (occur[item[0]] > occur[item[1]]):
                        noun_list.append(city_name)
                        noun_list.append(item[0])
                    else:
                        noun_list.append(city_name)
                        noun_list.append(item[1])

            i = i + 1

        final_combos = list(combinations(noun_list, 2))
        all_results = pd.DataFrame(
            columns=['search_term', 'interest', 'related'],
            index=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
        j = 0

        for item in final_combos:
            query = item[0] + " " + item[1]
            query = query.translate(str.maketrans('', '', string.punctuation))
            pytrend.build_payload(kw_list=[query], timeframe=tf)
            interest_over_time_df = pytrend.interest_over_time()

            related_queries_dict = pytrend.related_queries()

            # using the rising trends to observe the most popular search
            rise_query_df = related_queries_dict[query]['rising']
            all_results.iloc[j]['search_term'] = query
            all_results.iloc[j]['interest'] = interest_over_time_df
            all_results.iloc[j]['related'] = rise_query_df

            if (rise_query_df is not None):
                max_line = rise_query_df.loc[rise_query_df['value'].idxmax()]
                max_val = max_line['value']
                max_val_query = max_line['query']

                if (max_val > prev_max):
                    prev_max = max_val
                    prev_query = max_val_query
                    best = related_queries_dict
            j = j + 1

        pytrend.build_payload(kw_list=[prev_query], timeframe=tf)
        interest_over_time_df = pytrend.interest_over_time()
        df = interest_over_time_df.drop(labels='isPartial', axis=1)

        py.iplot([{
            'x': df.index,
            'y': df[col],
            'name': col
        } for col in df.columns],
                 filename='simple-line')

        import networkx as nx

        G = nx.Graph()

        for i in range(all_results.shape[0]):
            n1 = all_results.iloc[i]
            G.add_node(n1['search_term'])
            if (i != 0):
                edge = (n1['search_term'],
                        all_results.iloc[i - 1]['search_term'])
                G.add_edge(*edge)
            #else:
            #    G.node[0]['pos'] = (10, 5)

            if (all_results.iloc[i]['related'] is not None):
                for j in range(n1['related'].shape[0]):
                    n2 = n1['related'].iloc[j]
                    G.add_node(n2['query'])
                    edge = (n1['search_term'], n2['query'])
                    G.add_edge(*edge)

        pos = nx.spring_layout(G, k=2, iterations=20)
        nx.draw(G, node_size=1000, node_color='c', pos=pos, with_labels=True)
        #plt.savefig("simple_path.png")
        #plt.show()

        edge_trace = Scatter(x=[],
                             y=[],
                             line=Line(width=0.5, color='#888'),
                             hoverinfo='none',
                             mode='lines')

        node_trace = Scatter(x=[],
                             y=[],
                             text=list(G.nodes()),
                             mode='markers',
                             hoverinfo='text',
                             marker=Marker(showscale=False,
                                           colorscale='YlGnBu',
                                           reversescale=True,
                                           color=[],
                                           size=20,
                                           colorbar=dict(
                                               thickness=15,
                                               title='Node Connections',
                                               xanchor='left',
                                               titleside='right'),
                                           line=dict(width=2)))

        for node in G.nodes():
            node_trace['x'].append(pos[node][0])
            node_trace['y'].append(pos[node][1])

        for edge in G.edges():
            x0, y0 = pos[edge[0]]
            x1, y1 = pos[edge[1]]
            edge_trace['x'] += [x0, x1, None]
            edge_trace['y'] += [y0, y1, None]

        for i, node in enumerate(G.nodes()):
            node_trace['marker']['color'].append(0)
            node_info = str(node)
            node_trace['text'].append(node_info)

        fig = Figure(data=Data([edge_trace, node_trace]),
                     layout=Layout(showlegend=False,
                                   hovermode='closest',
                                   margin=dict(b=20, l=5, r=5, t=40),
                                   annotations=[
                                       dict(showarrow=False,
                                            xref="paper",
                                            yref="paper",
                                            x=0.005,
                                            y=-0.002)
                                   ],
                                   xaxis=XAxis(showgrid=False,
                                               zeroline=False,
                                               showticklabels=False),
                                   yaxis=YAxis(showgrid=False,
                                               zeroline=False,
                                               showticklabels=False)))

        py.iplot(fig, filename='networkx')

        #Update plotly plots based on the results from the model
        fig = {
            "data": [{
                "values": [],
                "hoverinfo": "none",
                "marker": {
                    "colors": []
                },
                "textinfo": "none",
                "hole": 0.6,
                "type": "pie"
            }],
            "layout": {
                "showlegend":
                False,
                "annotations": [{
                    "text": "",
                    "font": {
                        "size": 10
                    },
                    "showarrow": False
                }],
                "title":
                classification,
                "titlefont": {
                    "family": "Courier New",
                    "size": 180
                },
                "margin": {
                    "t": 600
                }
            }
        }
        py.iplot(fig, filename='classification')

        #import plotly.plotly as py
        #from plotly.graph_objs import *
        trace1 = {
            "domain": {
                "x": [0, 0.31],
                "y": [0.1, 1]
            },
            "hole": 0.6,
            "hoverinfo": "none",
            "labels": ["Data", ""],
            "marker": {
                "colors": ["rgb(53, 196, 170)", "rgb(255, 255, 255)"],
                "line": {
                    "color": ["rgb(0, 0, 0)"],
                    "width": 2
                }
            },
            "name": "CW",
            "textinfo": "none",
            "type": "pie",
            "values": cw_data
        }
        trace2 = {
            "domain": {
                "x": [0.33, 0.64],
                "y": [0.1, 1]
            },
            "hole": 0.6,
            "hoverinfo": "none",
            "labels": ["Data", ""],
            "marker": {
                "colors": ["rgb(53, 196, 170)", "rgb(255, 255, 255)"],
                "line": {
                    "color": ["rgb(0, 0, 0)"],
                    "width": 2
                }
            },
            "name": "SC",
            "textinfo": "none",
            "type": "pie",
            "values": sc_data
        }
        trace3 = {
            "domain": {
                "x": [0.66, 1],
                "y": [0.1, 1]
            },
            "hole": 0.6,
            "hoverinfo": "none",
            "labels": ["Data", ""],
            "marker": {
                "colors": ["rgb(53, 196, 170)", "rgb(255, 255, 255)"],
                "line": {
                    "color": ["rgb(0, 0, 0)"],
                    "width": 2
                }
            },
            "name": "AWL",
            "textinfo": "none",
            "type": "pie",
            "values": awl_data
        }
        fig = {
            "data": [trace1, trace2, trace3],
            "layout": {
                "annotations": [{
                    "x": 0.1,
                    "y": 0.12,
                    "font": {
                        "size": 16
                    },
                    "showarrow": False,
                    "text": "# Cap. Words"
                }, {
                    "x": 0.12,
                    "y": 0.04,
                    "font": {
                        "size": 16
                    },
                    "showarrow": False,
                    "text": str(np.round(cw_data[0], 0)) + "%"
                }, {
                    "x": 0.47,
                    "y": 0.12,
                    "font": {
                        "size": 16
                    },
                    "showarrow": False,
                    "text": "# Special Char"
                }, {
                    "x": 0.46,
                    "y": 0.04,
                    "font": {
                        "size": 16
                    },
                    "showarrow": False,
                    "text": str(np.round(sc_data[0], 0)) + "%"
                }, {
                    "x": 0.9,
                    "y": 0.12,
                    "font": {
                        "size": 16
                    },
                    "showarrow": False,
                    "text": "Avg. Word Len"
                }, {
                    "x": 0.85,
                    "y": 0.04,
                    "font": {
                        "size": 16
                    },
                    "showarrow": False,
                    "text": str(np.round(awl_data[0], 0)) + "%"
                }],
                "showlegend":
                False,
                "title":
                "Score Component Breakdown"
            }
        }
        py.iplot(fig, filename='score_bd')

        connection.close()

    return render_template("output.html", article=article)
Example #4
 def test_interest_over_time(self):
     pytrend = TrendReq()
     pytrend.build_payload(kw_list=['pizza', 'bagel'])
     self.assertIsNotNone(pytrend.interest_over_time())
Example #5
class GoogleTrendsData(object):
    '''Class to get data from Google Trends concurrently.'''
    def __init__(self,
                 kw: list,
                 normalize: bool,
                 category=0,
                 timezone=0,
                 timeframe='today 5-y',
                 geo='US',
                 gprop=''):
        self.kw = kw
        self.normalize = normalize
        self.cat = category
        self.tf = timeframe
        self.geo = geo
        self.gprop = gprop
        self.pytrends = TrendReq(hl='en-US', tz=timezone)

    def __repr__(self):
        return f'Lookup {self.kw}, {"normalized" if self.normalize else "not normalized"}.'

    def norm_str(self, space=False):
        if space:
            return 'Normalized' if self.normalize else 'Not Normalized'
        return 'Normalized' if self.normalize else 'NotNormalized'

    # Multiprocessing
    def get(self, processes=10):
        """Handles multiprocessing using ThreadPool; sends items from a list to a function and gets the results as a list"""
        # If we already have the data, get it from the CSV file without talking to Google
        file_name = 'output/' + ''.join(self.kw) + (self.norm_str()) + '.csv'

        try:
            data = pd.read_csv(file_name)
            print('Data cached. Reading csv...')
            # Convert the date column from str to datetime
            data['date'] = pd.to_datetime(data['date'])
            data.set_index('date', drop=True, inplace=True)
            return data
        except FileNotFoundError:
            print('Connecting to Google.')
            pass

        # If we want to normalize, bypass threading
        if self.normalize:
            result = self.gen_data(self.kw)

            # If we get an array back instead of a DataFrame we are rate limited
            try:
                result.drop('isPartial', axis=1)
            except AttributeError:
                sys.exit('Rate limited.')

            return result.drop('isPartial', axis=1)

        # Define the number of processes, use less than or equal to the defined value
        count_threads = min(processes, len(self.kw))
        if count_threads == 0:
            return []
        pool = ThreadPool(count_threads)

        # Tell the user what is happening
        print(f"Getting {len(self.kw)} items in {count_threads} processes.")

        # Call gen_data() for each keyword and collect the returned DataFrames
        result = list(pool.imap_unordered(self.gen_data, self.kw))
        pool.close()
        pool.join()

        # Result is a list of each different Pandas Dataframe, so we concatenate them together
        try:
            result = pd.concat(result, axis=1, join='inner').drop('isPartial',
                                                                  axis=1)
        except TypeError:
            sys.exit('Rate limited.')

        return result

    def gen_data(self, keywords):
        '''Generate a Pandas Dataframe based on the keyword(s) passed'''
        # Handle when we are passed a list of single letters
        if len(keywords[0]) == 1:
            keywords = [''.join(keywords)]

        if self.normalize:
            # Raise error before we send the request
            if len(keywords) > 5 and isinstance(keywords, list):
                raise ValueError('Too many keywords for normalization.')

            try:
                self.pytrends.build_payload(keywords,
                                            cat=self.cat,
                                            timeframe=self.tf,
                                            geo=self.geo,
                                            gprop=self.gprop)
            except ResponseError:
                return []

            data = self.pytrends.interest_over_time()
            return data

        # Handle when we are not normalizing the data
        else:
            for keyword in keywords:
                print(f'Getting {keyword}')

                # Build the dataset with the first keyword
                if keyword == keywords[0]:
                    try:
                        self.pytrends.build_payload([keyword],
                                                    cat=self.cat,
                                                    timeframe=self.tf,
                                                    geo=self.geo,
                                                    gprop=self.gprop)
                    except ResponseError:
                        return []

                    data = self.pytrends.interest_over_time()
                    continue

                # After we have the dataset we append the new data
                self.pytrends.build_payload([keyword], self.cat, self.tf,
                                            self.geo, self.gprop)
                data[keyword] = self.pytrends.interest_over_time()[keyword]

            # Rearrange columns
            cols = list(data.columns.values)
            cols.append(cols.pop(cols.index('isPartial')))

            return data[cols]

    def graph(self, data, filename='o'):
        p = data.plot(x=data.index)
        p.set_title(f'Interest Over Time: {self.norm_str(True)}')
        p.set_ylabel('Interest Level')
        p.set_xlabel('Date')
        p.get_figure().savefig(f'{filename}.png')
        return p

    def save(self, d):
        d.to_csv(f'./output/{"".join(self.kw)}{self.norm_str()}.csv')
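A minimal usage sketch under assumed settings; an output/ directory must exist for save() and for the CSV cache lookup in get():

trends = GoogleTrendsData(kw=['bitcoin', 'ethereum'], normalize=False)
data = trends.get(processes=2)   # threaded fetch, served from the CSV cache when present
trends.save(data)
trends.graph(data, filename='interest')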
Example #6
    def get_google_trends(self,
                          kw_list,
                          trdays=250,
                          overlap=100,
                          cat=0,
                          geo='',
                          tz=360,
                          gprop='',
                          hl='en-US',
                          sleeptime=1,
                          isPartial_col=False,
                          from_start=False,
                          scale_cols=True):
        """Retrieve daily google trends data for a list of search terms

        Parameters
        ----------
        kw_list : list of search terms (max 5) - see pytrends for more details
        trdays : the number of days to pull data for in a search
            (the max is around 270, though the website seems to indicate 90)
        overlap : the number of overlapped days when stitching two searches together
        cat : category to narrow results - see pytrends for more details
        geo : two-letter country abbreviation (e.g. 'US', 'UK')
            default is '', which returns global results - see pytrends for more details
        tz : timezone offset
            (default is 360, which corresponds to US CST - see pytrends for more details)
        gprop : filter results to a specific google property
            available options are 'images', 'news', 'youtube' or 'froogle'
            default is '', which refers to web searches - see pytrends for more details
        hl : language (e.g. 'en-US' (default), 'es') - see pytrends for more details
        sleeptime : when stitching multiple searches, this sets the pause between requests
        isPartial_col : keep the isPartial column in the output
            (default is False, i.e. the column is removed)
        from_start : when stitching multiple results, this determines whether searches
            are combined going forward or backwards in time
            (default is False, meaning searches are stitched with the most recent first)
        scale_cols : google trends traditionally returns scores between 0 and 100;
            stitching could produce values greater than 100,
            so setting this to True (default) rescales the values to between 0 and 100

        Returns
        -------
        pandas Dataframe

        Notes
        -----
        This method is essentially a highly restricted wrapper for the pytrends package
        Any issues/questions related to its use would probably be more likely resolved
        by consulting the pytrends github page
        https://github.com/GeneralMills/pytrends
        """

        if len(kw_list) > 5 or len(kw_list) == 0:
            raise ValueError("The keyword list must contain between 1 and 5 terms")
        if trdays > 270:
            raise ValueError("trdays must not exceed 270")
        if overlap >= trdays:
            raise ValueError("overlap must be less than trdays")
        stitch_overlap = trdays - overlap
        from_date = datetime.datetime.strptime(self.from_date, '%Y-%m-%d')
        to_date = datetime.datetime.strptime(self.to_date, '%Y-%m-%d')
        n_days = (to_date - from_date).days
        # launch pytrends request
        _pytrends = TrendReq(hl=hl, tz=tz)
        # get the dates for each search
        if n_days <= trdays:
            trend_dates = [' '.join([self.from_date, self.to_date])]
        else:
            trend_dates = [
                '{} {}'.format(
                    (to_date -
                     datetime.timedelta(i + trdays)).strftime("%Y-%m-%d"),
                    (to_date - datetime.timedelta(i)).strftime("%Y-%m-%d"))
                for i in range(0, n_days - trdays +
                               stitch_overlap, stitch_overlap)
            ]
        if from_start:
            trend_dates = trend_dates[::-1]
        _pytrends.build_payload(kw_list,
                                cat=cat,
                                timeframe=trend_dates[0],
                                geo=geo,
                                gprop=gprop)
        output = _pytrends.interest_over_time().reset_index()
        if len(output) == 0:
            raise ValueError(
                'search term returned no results (insufficient data)')
        for date in trend_dates[1:]:
            time.sleep(sleeptime)
            _pytrends.build_payload(kw_list,
                                    cat=cat,
                                    timeframe=date,
                                    geo=geo,
                                    gprop=gprop)
            temp_trend = _pytrends.interest_over_time().reset_index()
            temp_trend = temp_trend.merge(output, on="date", how="left")
            # it's ugly but we'll exploit the common column names
            # and then rename the underscore containing column names
            for kw in kw_list:
                norm_factor = np.ma.masked_invalid(
                    temp_trend[kw + '_y'] / temp_trend[kw + '_x']).mean()
                temp_trend[kw] = temp_trend[kw + '_x'] * norm_factor
            temp_trend = temp_trend[temp_trend.isnull().any(axis=1)]
            temp_trend['isPartial'] = temp_trend['isPartial_x']
            output = pd.concat(
                [output, temp_trend[['date', 'isPartial'] + kw_list]],
                axis=0,
                sort=False)

        # reorder columns: date, isPartial, then the keywords
        output = output[['date', 'isPartial'] + kw_list]

        if not isPartial_col:
            output = output.drop('isPartial', axis=1)
        output = output[output['date'] >= self.from_date]
        if scale_cols:
            # the values in each column are relative to other columns
            # so we need to get the maximum value across the search columns
            max_val = float(output[kw_list].values.max())
            for col in kw_list:
                output[col] = 100.0 * output[col] / max_val
        output = output.sort_values(
            'date', ascending=self.ascending).reset_index(drop=True)
        return output
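Since the method reads self.from_date, self.to_date and self.ascending, here is a hedged sketch of a minimal host class (names assumed) to exercise it:

# hypothetical host supplying the attributes the method reads;
# assumes get_google_trends is visible at module scope as defined above
class TrendsFetcher:
    def __init__(self, from_date, to_date, ascending=True):
        self.from_date = from_date
        self.to_date = to_date
        self.ascending = ascending

    get_google_trends = get_google_trends

fetcher = TrendsFetcher('2019-01-01', '2019-12-31')
daily = fetcher.get_google_trends(['bitcoin'], trdays=250, overlap=100)
print(daily.head())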
Example #7
"""

# The keyword list collection can take up to 5 keywords.
kw_list = ["covid"]

# The Payload - (Requester).



# GET DATA FOR INTEREST OVERTIME

# Define the parameters for the payload requester for 30-days.
pytrends.build_payload(kw_list, cat=0, timeframe='today 1-m', geo='GB', gprop='')

# Execute the payload request.
pd_iot_thirty_days = pytrends.interest_over_time()

# Remove index.
pd_iot_thirty_days.reset_index(inplace=True)

# Rename API Columns.
pd_iot_thirty_days.rename(columns={'date': 'Date', 'covid': 'Value'}, inplace=True)

# Add additional columns required for the report.
pd_iot_thirty_days["Label"] = pd_iot_thirty_days["Value"]
pd_iot_thirty_days["Range"] = "Last-30-Days"

# Sort pd_iot_thirty_days By Date.
pd_iot_thirty_days.sort_values(by=["Date"], inplace=True, ascending=True)

# Print returned output for the pd_iot_thirty_days payload request.
print(pd_iot_thirty_days)
Example #8
from pytrends.request import TrendReq
pytrends = TrendReq(hl='ja-JP', tz=360)

kw_list = ["Trump"]
pytrends.build_payload(kw_list,
                       cat=0,
                       timeframe='today 1-m',
                       geo='US',
                       gprop='')

result = pytrends.interest_over_time()
# result = pytrends.get_historical_interest(kw_list, year_start=2018, month_start=1, day_start=1, hour_start=0, year_end=2018, month_end=2, day_end=1, hour_end=0, cat=0, geo='', gprop='', sleep=0)
# result = pytrends.interest_by_region(resolution='DMA', inc_low_vol=True, inc_geo_code=False)
# result = pytrends.related_topics()
result.to_csv('data' + "" + '.csv', encoding='utf_8')
Example #9
 def test_interest_over_time(self):
     pytrend = TrendReq()
     pytrend.build_payload(kw_list=['pizza', 'bagel'])
     self.assertIsNotNone(pytrend.interest_over_time())
Example #10
class myThread(threading.Thread):
    def __init__(self, keyword, timeframe, count):
        threading.Thread.__init__(self)
        # python google trend tool
        self.pytrend = TrendReq()
        # search keyword
        self.keyword = keyword
        # start time
        self.timeframe = timeframe
        # thread count
        self.count = count
        print("Thread " + str(self.count) + " " + self.timeframe +
              " is starting.")

    def run(self):
        # create keyword list
        keyword_list = []
        keyword_list.append(self.keyword)

        # random select sleep time
        sleep_time = random.randint(5, 10)
        # time sleep between each request
        with sema:
            interest = self.download(keyword_list)
            time.sleep(sleep_time)

        # global result
        global result
        try:
            result_lock.acquire()
            # add interest to result list
            result.append(interest)
            print("Thread " + str(self.count) + " " + self.timeframe +
                  " is finished.")
        finally:
            result_lock.release()

    '''
    input: search keyword list: string list
    output: interest dataframe
    exception: connection abortion
    '''

    def download(self, keyword_list):
        try:
            # call google trend api
            self.pytrend.build_payload(kw_list=keyword_list,
                                       cat=0,
                                       timeframe=self.timeframe,
                                       geo='',
                                       gprop='')

        except (Exception, OSError, requests.ConnectionError) as e:
            print("Thread " + str(self.count) + " " + self.timeframe +
                  " goes wrong.")
            print(e)
            print("Retry downloading " + "Thread " + str(self.count) + " " +
                  self.timeframe)
            # random select sleep time
            sleep_time = random.randint(10, 30)
            time.sleep(sleep_time)
            interest = self.download(keyword_list)
            return interest
        else:
            # get the dataframe of interest
            interest = self.pytrend.interest_over_time()
            return interest
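The class relies on module-level sema, result and result_lock; a hedged driver sketch (limits and timeframes assumed):

import threading
from pytrends.request import TrendReq

sema = threading.Semaphore(3)   # cap on concurrent requests (assumed value)
result = []                     # shared list of interest DataFrames
result_lock = threading.Lock()

threads = [myThread('bitcoin', tf, i)
           for i, tf in enumerate(['2019-01-01 2019-03-31',
                                   '2019-04-01 2019-06-30'])]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(len(result), 'frames downloaded')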
Example #11
def tweet_text(image_path):

    img = cv2.imread(image_path)

    try:
        
        string = pytesseract.image_to_string(img)

    except TypeError:

        return 'imgur format'
    
    stop_words = set(stopwords.words('english')) 

    print(string)

    word_tokens = word_tokenize(string) 

    filtered_sentence = [] 

    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)

    filtered_sentence = filtered_sentence[3:]
    print(filtered_sentence)

    with open('1-1000.txt','r') as fin:
        lines = fin.readlines()

    common = []

    for line in lines:
        common.append(line.rstrip('\n'))

    
    filtered_sentence = [s for s in filtered_sentence if s.lower() not in common]

    filtered_sentence = [x for x in filtered_sentence if len(x) > 2 and '\n' not in x and
                '.' not in x and ',' not in x and 'Retweets' not in x and
                'Likes' not in x and 'iPhone' not in x and 'Twitter' not in x and
                '/' not in x and 'Retweeted' not in x and '|' not in x and
                '©' not in x and '>' not in x and 'Comments' not in x and
                ':' not in x and '-' not in x and 'ing' not in x]

    filtered_sentence = list(dict.fromkeys(filtered_sentence))
    print(filtered_sentence)
    

    trends_values_ca = []
    trends_values_ny = []

    for items in filtered_sentence:

        temp = []
        temp.append(items)

        if len(items) > 0:

            pytrend_ca = TrendReq(hl='en-US', tz=360)
            pytrend_ny = TrendReq(hl='en-US', tz=360)

            pytrend_ca.build_payload(kw_list=list(temp), timeframe='now 1-d', geo='US-CA')
            pytrend_ny.build_payload(kw_list=list(temp), timeframe='now 1-d', geo='US-NY') 

            df_ca = pytrend_ca.interest_over_time()
            df_ny = pytrend_ny.interest_over_time()


            try:

                trends_values_ca.append(sum(list(df_ca[items])[-20:])/(len(df_ca.index)-20))

            except KeyError:

                trends_values_ca.append(0)

            try:

                trends_values_ny.append(sum(list(df_ny[items])[-20:])/(len(df_ny.index)-20))

            except KeyError:

                trends_values_ny.append(0)

    filtered_sentence = sorted((list(zip(filtered_sentence, 
                                list(map(lambda x, y: (x*0.82+y*0.18)/2, trends_values_ca, trends_values_ny))))),
                                key=itemgetter(1), reverse=True)

    print(filtered_sentence)


    if len(filtered_sentence) > 3:
        print("#" + filtered_sentence[0][0] + " #" + filtered_sentence[1][0] + " #" + filtered_sentence[2][0] + " #" + filtered_sentence[3][0])
        return "#" + filtered_sentence[0][0] + " #" + filtered_sentence[1][0] + " #" + filtered_sentence[2][0] + " #" + filtered_sentence[3][0]

    elif len(filtered_sentence) == 3:
        print("#" + filtered_sentence[0][0] + " #" + filtered_sentence[1][0] + " #" + filtered_sentence[2][0])
        return "#" + filtered_sentence[0][0] + " #" + filtered_sentence[1][0] + " #" + filtered_sentence[2][0]

    elif len(filtered_sentence) == 2:
        print("#" + filtered_sentence[0][0] + " #" + filtered_sentence[1][0])
        return "#" + filtered_sentence[0][0] + " #" + filtered_sentence[1][0]

    elif len(filtered_sentence) == 1:
        print("#" + filtered_sentence[0][0])
        return "#" + filtered_sentence[0][0]

    else:
        return ""
Example #12
        kw_list.remove('2017')
    else:
        time_start = str(df.post_published[i].date() - timedelta(days=7))
        time_stop = str(df.post_published[i].date() + timedelta(days=7))

        timeframe = time_start + ' ' + time_stop

        pytrends = TrendReq(hl='en-US', tz=360)

        pytrends.build_payload(kw_list,
                               cat=0,
                               timeframe=timeframe,
                               geo='',
                               gprop='')

        trends = pytrends.interest_over_time()
        if trends.shape[0] < 15:
            kw_list = kw_list[0].split()
            pytrends.build_payload(kw_list,
                                   cat=0,
                                   timeframe=timeframe,
                                   geo='',
                                   gprop='')
            trends = pytrends.interest_over_time()
        trends.sum(axis=1)

        df_trends = df_trends.append(
            pd.DataFrame(data=[
                np.append([df.post_published[i], df.tags[i]],
                          trends.sum(axis=1).values)
            ],
Example #13
class Trender:
    '''
    class that keeps track of terms
    knows how to plot a term
    '''
    def __init__(self, terms, daterange, locality):
        self.terms = []
        self.update = True
        self.pytrend = TrendReq()
        if daterange is None:
            daterange = "today 12-m"
        if locality is None:
            locality = "DK"
        self.daterange = daterange
        self.locality = locality
        for term in terms:
            self.terms.append(
                TermTrend(
                    term, self._getTrend([term], self.daterange,
                                         self.locality)))

    def plot(self, plotout):
        df = self.combinedtrend.copy().reset_index()
        x = list(df.columns)[0]
        ys = list(df.columns)[1:]
        plot = interactive_plot(df, x, ys)
        #print("Saving plot...")
        #if plotout == None:
        #    plotout = "plot.png"
        #plot.write_image(plotout)

    def update_trends(self):
        self.update = True
        self.trends()

    def trends(self):
        if self.update:
            #returns trend
            term_faceoff = []
            combinations_list = getCombinations(self.terms)
            tmp_combined_trend = dict()
            for comb in combinations_list:
                combined_trend = self._getTrend([comb[0].term, comb[1].term],
                                                self.daterange, self.locality)
                term = faceoff(comb[0], comb[1], combined_trend)
                tmp_combined_trend[(comb[0], comb[1])] = combined_trend
                term_faceoff.append(term)
            self.update = False
            term_faceoff = list(Counter(term_faceoff))

            #create dataframe
            self.combinedtrend = pd.DataFrame(
                index=term_faceoff[0].trend.index.copy(),
                data={term_faceoff[0].term: term_faceoff[0].trend.iloc[:, 0]})
            scalefactor_list = []
            for i in range(len(term_faceoff) - 1):
                combined_trend = None
                if ((term_faceoff[i],
                     term_faceoff[i + 1])) in tmp_combined_trend.keys():
                    combined_trend = tmp_combined_trend[(term_faceoff[i],
                                                         term_faceoff[i + 1])]
                else:
                    combined_trend = tmp_combined_trend[(term_faceoff[i + 1],
                                                         term_faceoff[i])]
                scalefactor = getScaleFactor(combined_trend,
                                             term_faceoff[i + 1])
                scalefactor_list.append(scalefactor)
                scaled_trend = term_faceoff[i + 1].trend.iloc[:, 0]
                print(scaled_trend.head())
                for scale in scalefactor_list:
                    scaled_trend = scaled_trend.apply(
                        lambda x: x / 100 * scale)
                print(scaled_trend.head())
                self.combinedtrend[term_faceoff[i + 1].term] = scaled_trend
                print(self.combinedtrend.head())
        return self.combinedtrend

    def _getTrend(self, payload, daterange, locality):
        payload_built = False

        while not payload_built:
            try:
                self.pytrend.build_payload(kw_list=payload,
                                           timeframe=daterange,
                                           geo=locality)
                payload_built = True
            except:
                print("Rebuilding payload...")
                time.sleep(random.randint(1, 4))
                pass

        results = None
        tries = 0
        print("Fetching trend:", payload)
        while results is None:
            try:
                results = self.pytrend.interest_over_time()
                results.to_csv("out.csv")
            except:
                print("Retrying...")
                tries += 1
                time.sleep(random.randint(1, 4))
                if tries >= 5:
                    print("Error. Could not connect to server instance.")
                    break
        return results
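A hedged usage sketch, assuming the helpers the class references (TermTrend, getCombinations, faceoff, getScaleFactor, interactive_plot) are in scope:

t = Trender(['python', 'java', 'rust'], daterange='today 12-m', locality='US')
t.trends()            # fetch and stitch the pairwise-normalized trends
t.plot('plot.png')    # render via interactive_plot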
Example #14
 def test_ispartial_dtype_timeframe_all(self):
     pytrend = TrendReq()
     pytrend.build_payload(kw_list=['pizza', 'bagel'],
                           timeframe='all')
     df = pytrend.interest_over_time()
     assert ptypes.is_bool_dtype(df.isPartial)
Example #15
def download_ght_by_states_today(kw_list):
    pytrend = TrendReq()

    states = ["AL","AK","AZ","AR","CA","CO","CT","DE","DC","FL","GA","HI","ID","IL","IN","IA","KS","KY","LA","ME","MT",
              "NE","NV","NH","NJ","NM","NY","NC","ND","OH","OK","OR","MD","MA","MI","MN","MS","MO","PA","RI","SC","SD",
              "TN", "TX","UT","VT","VA","WA","WV","WI","WY"]

    def state_ind_to_name():
        state_dict = {'US-AL': 'Alabama',
         'US-AK': 'Alaska',
         'US-AZ': 'Arizona',
         'US-AR': 'Arkansas',
         'US-CA': 'California',
         'US-CO': 'Colorado',
         'US-CT': 'Connecticut',
         'US-DE': 'Delaware',
         'US-DC': 'District of Columbia',
         'US-FL': 'Florida',
         'US-GA': 'Georgia',
         'US-HI': 'Hawaii',
         'US-ID': 'Idaho',
         'US-IL': 'Illinois',
         'US-IN': 'Indiana',
         'US-IA': 'Iowa',
         'US-KS': 'Kansas',
         'US-KY': 'Kentucky',
         'US-LA': 'Louisiana',
         'US-ME': 'Maine',
         'US-MD': 'Maryland',
         'US-MA': 'Massachusetts',
         'US-MI': 'Michigan',
         'US-MN': 'Minnesota',
         'US-MS': 'Mississippi',
         'US-MO': 'Missouri',
         'US-MT': 'Montana',
         'US-NE': 'Nebraska',
         'US-NV': 'Nevada',
         'US-NH': 'New Hampshire',
         'US-NJ': 'New Jersey',
         'US-NM': 'New Mexico',
         'US-NY': 'New York',
         'US-NC': 'North Carolina',
         'US-ND': 'North Dakota',
         'US-OH': 'Ohio',
         'US-OK': 'Oklahoma',
         'US-OR': 'Oregon',
         'US-PA': 'Pennsylvania',
         'US-RI': 'Rhode Island',
         'US-SC': 'South Carolina',
         'US-SD': 'South Dakota',
         'US-TN': 'Tennessee',
         'US-TX': 'Texas',
         'US-UT': 'Utah',
         'US-VT': 'Vermont',
         'US-VA': 'Virginia',
         'US-WA': 'Washington',
         'US-WV': 'West Virginia',
         'US-WI': 'Wisconsin',
         'US-WY': 'Wyoming'}
        return state_dict

    df = pd.DataFrame()
    st_name = state_ind_to_name()

    for st_ind in states:
        pytrend.build_payload(kw_list, geo='US-' + st_ind, timeframe='today 5-y')
        print('region processed US-' + st_ind)
        df1 = pytrend.interest_over_time()
        df1 = df1.assign(state=st_name['US-' + st_ind])
        df = pd.concat([df, df1])
        filename = filepath + 'ght_state-'+ str(epi.Week.thisweek().year) + "{:02d}".format(epi.Week.thisweek().week)+'.csv'
        df.to_csv(filename)
    return df
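A hedged usage sketch; the function writes CSV checkpoints through a module-level filepath, assumed here:

filepath = './'   # assumed output directory for the CSV checkpoints
df = download_ght_by_states_today(['flu'])
print(df.groupby('state').size())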
Example #16
#must have pytrends installed
from pytrends.request import TrendReq

pytrends = TrendReq(hl='en-US', tz=360)

print "debug 1"

pytrends.build_payload(kw_list=['coinbase'], timeframe='now 1-H')

print "debug 2"

testdata = pytrends.interest_over_time()

print "debug 3"

print testdata
Example #17
        df = pickle.load(f)
        print('Loaded {} from cache'.format(quandl_id))
    except (OSError, IOError) as e:
        print('Downloading {} from Quandl'.format(quandl_id))
        df = quandl.get(quandl_id, returns="pandas")
        df.to_pickle(cache_path)
        print('Cached {} at {}'.format(quandl_id, cache_path))
    return df


google_trends = TrendReq(hl='en-US', tz=360)
google_trends.build_payload(
    kw_list=[search_term],
    timeframe='today 3-m',
)
df_interest = google_trends.interest_over_time()

df_price = cached_fetch_quantl(quandl_data_id)
'''Get start date and drop previous data'''
start = pd.to_datetime('today') - relativedelta(months=3)
start = start.date()
df_price = df_price.loc[start:]
df_interest = df_interest.loc[start:]

df = pd.concat([
    df_interest[search_term], df_price['Weighted Price'],
    df_price['Volume (BTC)']
],
               axis=1)
'''Plot data'''
fig, ax1 = plt.subplots()
Example #18
        if os.path.exists(out_file):
            print(coin + " has been downloaded.")
            continue

        # Run the first time (if we want to start from today; otherwise we need to ask for an end_date as well)
        # today = datetime.today().date()
        today = end_date
        old_date = today

        # Go back in time
        new_date = today - timedelta(days=step)
        # Create new timeframe for which we download data
        timeframe = new_date.strftime('%Y-%m-%d') + ' ' + old_date.strftime(
            '%Y-%m-%d')
        pytrend.build_payload(kw_list=kw_list, timeframe=timeframe)
        interest_over_time_df = pytrend.interest_over_time()

        data_flag = 1
        ## RUN ITERATIONS
        while new_date > start_date:

            ### Save the new date from the previous iteration.
            # Overlap == 1 would mean that we start where we
            # stopped on the iteration before, which gives us
            # indeed overlap == 1.
            old_date = new_date + timedelta(days=overlap - 1)

            ### Update the new date to take a step into the past
            # Since the timeframe that we can apply for daily data
            # is limited, we use step = maxstep - overlap instead of
            # maxstep.
Example #19
def pullTrends(kw_list, start_date, end_date):
    from pytrends.request import TrendReq

    pytrends = TrendReq(hl='en-US', tz=360)

    term = kw_list[0]

    def toTimeframe(ts1, ts2):
        s1 = f"{ts1:%Y-%m-%d}"
        s2 = f"{ts2:%Y-%m-%d}"
        return "{0} {1}".format(s1, s2)

    def diff_month(d1, d2):
        return (d1.year - d2.year) * 12 + d1.month - d2.month

    def next_month(d):
        if (d.month == 12):
            d = d.replace(year=d.year + 1, month=1)
        else:
            d = d.replace(month=d.month + 1)
        return d

    def last_day_of_month(d):
        d = d.replace(day=1)
        d = next_month(d)

        d += timedelta(days=-1)
        return d

    def first_day_of_month(d):
        return d.replace(day=1)

    def renormalize(df):
        months, daylist = df
        for i in range(months.shape[0]):
            daylist[i][term] = daylist[i][term].apply(
                lambda x: x * months.iloc[i][term])
        return daylist

    def flatten(daylist):
        # DataFrame.append was removed in pandas 2.0; concatenate instead
        return pd.concat(daylist)

    start_date = pd.to_datetime(start_date)
    st_date = start_date
    end_date = pd.to_datetime(end_date)
    n_months = diff_month(end_date, start_date)

    # pull months

    # hacky fix, don't change
    if n_months < 63:
        start_date_tmp = start_date.replace(year=start_date.year -
                                            int((64 - n_months + 11) / 12))
    else:
        start_date_tmp = start_date
    # to string
    start_date_str = f"{start_date_tmp:%Y-%m-%d}"
    end_date_str = f"{end_date:%Y-%m-%d}"

    #get monthly
    pytrends.build_payload(kw_list,
                           cat=0,
                           timeframe="{0} {1}".format(start_date_str,
                                                      end_date_str),
                           geo='US',
                           gprop='')
    monthly = pytrends.interest_over_time()
    monthly = monthly[monthly.index > st_date]

    #get daily
    start_date = first_day_of_month(st_date)
    tmp_end_date = last_day_of_month(start_date)

    daylist = []

    for i in range(n_months):

        pytrends.build_payload(kw_list,
                               cat=0,
                               timeframe=toTimeframe(start_date, tmp_end_date),
                               geo='US',
                               gprop='')
        daily = pytrends.interest_over_time()

        daylist.append(daily)

        start_date = next_month(start_date)
        tmp_end_date = last_day_of_month(start_date)

    return flatten(renormalize((monthly, daylist))).drop(['isPartial'], axis=1)
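A hedged usage sketch; pandas and timedelta must be importable in the enclosing module for the helpers inside pullTrends:

from datetime import timedelta
import pandas as pd

daily = pullTrends(['bitcoin'], '2019-01-01', '2019-12-31')
print(daily.head())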
Example #20
_rolling_dates = [
    ' '.join(
        map(lambda x: x.strftime(_date_fmt),
            [_tmp_range[i], _tmp_range[i + 1]]))
    for i in range(len(_tmp_range) - 1)
]

# initialization of the major data frame _df_trends
# _dates will contain our last payload argument
_dates = _rolling_dates[0]
_pytrends.build_payload(_kw_list,
                        cat=_cat,
                        timeframe=_dates,
                        geo=_geo,
                        gprop=_gprop)
_df_trends = _pytrends.interest_over_time()

for _dates in _rolling_dates[1:]:
    # we need to normalize data before concatenation
    _common_date = _dates.split(' ')[0]
    _pytrends.build_payload(_kw_list,
                            cat=_cat,
                            timeframe=_dates,
                            geo=_geo,
                            gprop=_gprop)
    _tmp_df = _pytrends.interest_over_time()
    _multiplication_factor = _df_trends.loc[_common_date] / \
        _tmp_df.loc[_common_date]

    _df_trends = (
        pd.concat([_df_trends, (_tmp_df[1:] * _multiplication_factor)
Example #21
def pull_keywords_trend(keywords_list,
                        keyword_short_name,
                        time_frame,
                        geo='US',
                        save_folder=None,
                        relative_to_each_other=True):
    """

    :param keywords_list: up to 5
    :param time_frame: Specific dates, 'YYYY-MM-DD YYYY-MM-DD' example '2016-12-14 2017-01-25'
    :param geo:
    :param save_folder: the path to save, optional
    :param relative_to_each_other:

    # notes on the keywords:
    last time used:
    ['AMGN','CELG','BIIB','GILD','REGN']

    # notes in the timeframe

    - Date to start from

    - Defaults to last 5yrs, 'today 5-y'.

    - Everything 'all'

    - Specific dates, 'YYYY-MM-DD YYYY-MM-DD' example '2016-12-14 2017-01-25'

    - Specific datetimes, 'YYYY-MM-DDTHH YYYY-MM-DDTHH' example '2017-02-06T10 2017-02-12T07'

    Note Time component is based off UTC
    Current Time Minus Time Pattern:

    By Month: 'today #-m' where # is the number of months from that date to pull data for

    For example: 'today 3-m' would get data from today to 3months ago
    NOTE Google uses UTC date as 'today'
    Seems to only work for 1, 2, 3 months only
    Daily: 'now #-d' where # is the number of days from that date to pull data for

    For example: 'now 7-d' would get data from the last week
    Seems to only work for 1, 7 days only
    Hourly: 'now #-H' where # is the number of hours from that date to pull data for

    For example: 'now 1-H' would get data from the last hour
    Seems to only work for 1, 4 hours only

    35 weeks is by day
    50 weeks is by week

    :return: DataFrame
    """
    # , proxies=['https://35.201.123.31:880', ]
    pytrends = TrendReq(hl='en-US',
                        tz=360)  # tz is time zone offset in minutes
    if relative_to_each_other:
        pytrends.build_payload(keywords_list,
                               cat=0,
                               timeframe=time_frame,
                               geo=geo,
                               gprop='')
        interest_over_time_df = pytrends.interest_over_time()
    else:
        print(
            'Keywords will be pulled one by one, so each is scaled to its own maximum of 100 over the period'
        )
        interest_over_time_df = pd.DataFrame()
        for keyword in keywords_list:
            pytrends.build_payload([keyword],
                                   cat=0,
                                   timeframe=time_frame,
                                   geo=geo,
                                   gprop='')
            if len(interest_over_time_df) == 0:
                interest_over_time_df = pytrends.interest_over_time()
            else:
                interest_over_time_df[keyword] = pytrends.interest_over_time(
                )[keyword]

    # now lets check if it is by week or by day
    a_delta = interest_over_time_df.index[1] - interest_over_time_df.index[0]
    if a_delta.days > 1:
        by_day = False
        print('This trend is by week')
    else:
        by_day = True
        print('This trend is by day')

    # now lets get the file name for the pulled trend
    file_name = f'{keyword_short_name}_{relative_to_each_other}_{"by_day" if by_day else "by_week"}.csv'
    print(f'trend file name is {file_name}')
    if save_folder:
        interest_over_time_df.to_csv(os.path.join(save_folder, file_name))
    print(f'Total lines: {len(interest_over_time_df)}')
    print(interest_over_time_df)
    return interest_over_time_df
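A hedged usage sketch using the ticker list mentioned in the docstring notes:

df = pull_keywords_trend(['AMGN', 'CELG', 'BIIB', 'GILD', 'REGN'],
                         keyword_short_name='biotech',
                         time_frame='2016-12-14 2017-01-25',
                         save_folder='.')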
Example #22
def graph(request):
    words = request.GET.get('q')
    if not words:
        return redirect('tot:index')
    date = request.GET.get('y')
    a = ','
    pytrends = TrendReq()
    if a not in words:
        if request.GET.get('y') == 'year':
            date = 'today 12-m'
        elif request.GET.get('y') == 'month':
            date = 'today 1-m'
        else:
            date = 'now 7-d'
        word1 = words
        list1 = [word1]
        pytrends.build_payload(list1,
                               cat=0,
                               timeframe=date,
                               geo='',
                               gprop='')
        value = pytrends.interest_over_time()
        del value['isPartial']
        value = value.reset_index()
        value2 = value.to_json(force_ascii=False,
                               orient='split',
                               date_format='iso',
                               date_unit='s')

        abc = json.loads(value2)
        ab = []
        cd = []
        for a in abc['data']:
            k = {}
            h = datetime.strptime(a[0], '%Y-%m-%dT%H:%M:%SZ')
            h2 = h.strftime('%Y-%m-%d %H:%M:%S')
            k['label'] = h2
            k['y'] = a[1]
            k['link'] = '/anal'
            ab.append(k)

        context = {'ab': ab, 'word1': word1}
        return render(request, 'tot/graph.html', context)
    elif a in words:
        words = words.split(',')
        word1 = words[0]
        word2 = words[1]
        if request.GET.get('y') == 'year':
            date = 'today 12-m'
        elif request.GET.get('y') == 'month':
            date = 'today 1-m'
        else:
            date = 'now 7-d'
        list1 = [word1, word2]
        pytrends.build_payload(list1,
                               cat=0,
                               timeframe=date,
                               geo='',
                               gprop='')
        value = pytrends.interest_over_time()
        del value['isPartial']
        value = value.reset_index()
        value2 = value.to_json(force_ascii=False,
                               orient='split',
                               date_format='iso',
                               date_unit='s')
        abc = json.loads(value2)
        ab = []
        cd = []
        for a in abc['data']:
            k = {}
            z = {}
            h = datetime.strptime(a[0], '%Y-%m-%dT%H:%M:%SZ')
            h2 = h.strftime('%Y-%m-%d %H:%M:%S')
            k['label'] = h2
            k['y'] = a[1]
            k['link'] = '/anal'
            ab.append(k)
            z['label'] = h2
            z['y'] = a[2]
            z['link'] = '/anal'
            cd.append(z)
            context = {'ab': ab, 'cd': cd, 'word1': word1, 'word2': word2}
        return render(request, 'tot/graph.html', context)
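The JSON round-trip above works, but the same chart payload can be built directly from the reset-index DataFrame. A minimal sketch, reusing the `value` and `word1` names from the view:

ab = [{'label': ts.strftime('%Y-%m-%d %H:%M:%S'), 'y': int(score), 'link': '/anal'}
      for ts, score in zip(value['date'], value[word1])]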
databegin = list(map(formatter, range(0, 19, 3)))
dataend = list(map(formatter, range(4, 25, 3)))

for w in range(306, 373):
    daysprior = w  # relative to 24/06/2018, so the starting date is 21/06/2018
    daysbefore = datetime.date.today() - datetime.timedelta(days=daysprior)

    keywords = ["blockchain"]

    for begin_hour, end_hour in zip(databegin, dataend):
        begin = daysbefore.strftime("%Y-%m-%d") + "T" + begin_hour
        end = daysbefore.strftime("%Y-%m-%d") + "T" + end_hour

        timeframestring = begin + " " + end

        for kw in keywords:
            pytrend.build_payload(kw_list=[kw], timeframe=timeframestring)
            df = pytrend.interest_over_time()
            df.to_csv("../data/" + kw + "/" + timeframestring + ".csv")

    begin = daysbefore.strftime("%Y-%m-%d") + "T21"
    end = (datetime.date.today() -
           datetime.timedelta(days=daysprior - 1)).strftime("%Y-%m-%d") + "T01"
    timeframestring = begin + " " + end

    for kw in keywords:
        pytrend.build_payload(kw_list=[kw], timeframe=timeframestring)
        df = pytrend.interest_over_time()
        df.to_csv("../data/" + kw + "/" + timeframestring + ".csv")
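This fragment relies on `formatter` and `pytrend`, which are defined earlier in the original script and not shown here. A plausible stand-in (an assumption, not the author's code) that yields the zero-padded hour strings these hourly timeframes require:

from pytrends.request import TrendReq

pytrend = TrendReq()

def formatter(hour):
    # zero-pad so timeframes read like '2017-08-22T00 2017-08-22T04'
    return str(hour).zfill(2)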
Example #24
0
from pytrends.request import TrendReq
pytrend = TrendReq(hl='de', tz=390, retries=10, backoff_factor=0.5)

keywords = ['foo', 'bar', 'dummy']

# collect one DataFrame per keyword; the original mutated globals() inside
# the loop, which overwrote every df_<i> with the current keyword's data
dfs = {}
for keyword in keywords:
    try:
        pytrend.build_payload(kw_list=[keyword], geo='DE', timeframe='now 1-d')
        df = pytrend.interest_over_time()
        dfs[keyword] = df.drop(labels=['isPartial'], axis='columns')
        print(keyword + ' was successfully pulled from Google Trends')
    except Exception as e:
        print(keyword +
              ' was not successfully pulled because of the following error: ' +
              str(e))
        continue
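With the per-keyword frames collected in `dfs`, a combined table is one concat away; note that each column is still scaled 0-100 independently, because the keywords were queried one at a time:

import pandas as pd

combined = pd.concat(dfs, axis=1)               # hierarchical columns: (keyword, keyword)
combined.columns = combined.columns.droplevel(0)
print(combined.head())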
Example #25
0
class GTrends:
    def __init__(self, encoding, tz, timeout_connect, timeout_read, retries,
                 backoff_factor, geo, dbx):
        #initialize google trends connector
        self.pytrends = TrendReq(hl=encoding,
                                 tz=tz,
                                 timeout=(timeout_connect, timeout_read),
                                 retries=retries,
                                 backoff_factor=backoff_factor)
        self.geo = geo
        self.dbx = dbx

    def set_log(self, log):
        self.log = log

    #download file for ticker, category_name and time frame
    #if file already exists, do not download again
    def download_file(self,
                      ticker,
                      category_name,
                      frame,
                      local_path,
                      monthly=False):
        try:
            self.pytrends.build_payload([ticker],
                                        cat=category_name,
                                        timeframe=frame,
                                        geo=self.geo,
                                        gprop='')
            self.log.info(
                'Downloading data for ticker %s, category %s, frame %s' %
                (ticker, category_name, frame))
            data = self.pytrends.interest_over_time()
            df = pd.DataFrame(data)
            if (df.shape[0] == 0):
                self.log.info(
                    'Empty file for ticker %s, category %s, frame %s' %
                    (ticker, category_name, frame))
                df.to_csv(
                    local_path, index=False
                )  #save an empty file so next time do not make the request
                return True
            if 'isPartial' in df.columns:
                df.drop('isPartial', axis=1, inplace=True)
                df.reset_index(level=0, inplace=True)
            df['ticker'] = ticker
            if (monthly):
                df['date'] = pd.to_datetime(df['date'])
                df['date'] = df['date'].apply(lambda x: x.strftime('%Y-%m'))
            df.to_csv(local_path, index=False)
            return True
        except Exception as ex:
            self.log.error(
                'There has been an error downloading ticker %s, category %s, frame %s ex:%s\n%s',
                ticker, category_name, frame, type(ex), ex)
            return False

    #for each ticker, download data monthly and daily for each category
    def import_data(self, tickers_path, year_from, year_until, categories,
                    data_folder_monthly, data_folder_daily,
                    data_folder_monthly_dropbox, data_folder_daily_dropbox):
        lines = open(tickers_path).readlines()[1:]
        year_range = range(int(year_from), int(year_until))
        download_all = True
        for ticker in lines:
            ticker = ticker.rstrip('\n')
            self.log.info('Process ticker %s' % (ticker))
            for category in categories:
                category_type = category.split(':')
                category_name = category_type[0]
                category_type = category_type[1]
                if (category_type == 'monthly'):
                    dropbox_path = data_folder_monthly_dropbox + category_name
                    if (not self.dbx.folder_exists(dropbox_path)):
                        self.log.info('Create folder ' + dropbox_path)
                        self.dbx.create_folder(dropbox_path)

                    #download monthly data for all the year ranges
                    file_name = ticker + '_' + category_name + '_monthly.csv'
                    if (self.dbx.file_exists(dropbox_path, file_name)):
                        continue

                    frame = year_from + '-01-01 ' + year_until + '-12-31'
                    download_all = self.download_file(
                        ticker, category_name, frame,
                        data_folder_monthly + file_name, True)
                    if (download_all):
                        files_manager.upload_file(
                            data_folder_monthly + file_name,
                            dropbox_path + '/' + file_name, self.dbx)
                    else:
                        break
                else:
                    dropbox_path = data_folder_daily_dropbox + category_name
                    if (not self.dbx.folder_exists(dropbox_path)):
                        self.dbx.create_folder(dropbox_path)

                    for year in year_range:
                        #download first daily file for year; include the year
                        #in the name so different years do not overwrite each other
                        file_name = ticker + '_' + category_name + '_' + str(year) + '_1_daily.csv'
                        if (self.dbx.file_exists(dropbox_path, file_name)):
                            continue

                        frame = str(year) + '-01-01 ' + str(year) + '-06-30'
                        download_all = self.download_file(
                            ticker, category_name, frame,
                            data_folder_daily + file_name)

                        if (download_all):
                            files_manager.upload_file(
                                data_folder_daily + file_name,
                                dropbox_path + '/' + file_name, self.dbx)
                        else:
                            break

                        #download second daily file for year
                        file_name = ticker + '_' + category_name + '_' + str(year) + '_2_daily.csv'
                        if (self.dbx.file_exists(dropbox_path, file_name)):
                            continue
                        frame = str(year) + '-07-01 ' + str(year) + '-12-31'
                        download_all = self.download_file(
                            ticker, category_name, frame,
                            data_folder_daily + file_name)

                        if (download_all):
                            files_manager.upload_file(
                                data_folder_daily + file_name,
                                dropbox_path + '/' + file_name, self.dbx)
                        else:
                            break

            if (not download_all):
                break

        return download_all
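GTrends depends on a Dropbox wrapper and a logger supplied by the surrounding project. A hypothetical wiring with a do-nothing stand-in for the Dropbox object (its interface is inferred from the calls the class makes, and the file paths are illustrative only):

import logging

class DummyDbx:
    # minimal stand-in exposing the methods GTrends calls
    def folder_exists(self, path): return True
    def create_folder(self, path): pass
    def file_exists(self, folder, name): return False

gt = GTrends(encoding='en-US', tz=360, timeout_connect=5, timeout_read=30,
             retries=3, backoff_factor=0.5, geo='US', dbx=DummyDbx())
gt.set_log(logging.getLogger(__name__))
gt.download_file('AAPL', 0, '2018-01-01 2018-06-30', '/tmp/AAPL_daily.csv')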
Example #26
0
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

from pytrends.request import TrendReq

sns.set_style("whitegrid")

pytrend = TrendReq()

"""#Jogos Eletronicos, Buscador"""

pytrend.build_payload(kw_list=['Jogos Eletrônicos'], geo='BR', timeframe='all')

jogosEle = pytrend.interest_over_time()

jogosEle.head()

jogosEle.tail()

jogosEle.shape

jogosEle['2020-01-01': '2020-10-01']

plt.figure(figsize=(20,8))
plt.plot(jogosEle['Jogos Eletrônicos'])
plt.grid(True)
plt.title("Número de Pesquisas por Ano")
plt.ylabel("Número de Pesquisas")
plt.xlabel("Data")
Example #27
0
import time

import pandas as pd
from pytrends.request import TrendReq

dates_list = [
    '2014-01-01 2014-09-01',
    '2014-09-01 2015-05-01',
    '2015-05-01 2016-01-01',
    '2016-01-01 2016-09-01',
    '2016-09-01 2017-05-01',
    '2017-05-01 2018-01-01',
    '2018-01-01 2018-04-08',
]

kw_list = ["Blockchain", "Bitcoin", "Etherium", "crypto", "cryptocurrency"]

trends_df = []

for d in dates_list:
    pytrends = TrendReq(hl='en-US', tz=360)
    pytrends.build_payload(kw_list, cat=0, timeframe=d, geo='', gprop='')
    trends_df.append(pytrends.interest_over_time())
    time.sleep(2)

trends_df = pd.concat(trends_df)  # DataFrame.append was removed in pandas 2.0
trends_df = trends_df.loc[~trends_df.index.duplicated()]

trends_df.to_csv("google_trends_stats.csv")

trends_df['2017-06':'2018-03'].Bitcoin.plot(kind='line',
                                            figsize=(12, 6),
                                            title='Google Trends',
                                            style='b-',
                                            use_index=True)
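One caveat with this stitching: Google scales every chunk to its own 0-100 range, so values on opposite sides of a chunk boundary are not directly comparable. A sketch of boundary rescaling, assuming the per-range frames are still available as a list `chunks` (captured before the concat above) and that adjacent chunks share their boundary date:

# `chunks` = the list of per-range frames collected in the loop above
scaled = [chunks[0][kw_list]]
for chunk in chunks[1:]:
    shared = scaled[-1].index.intersection(chunk.index)
    ratio = scaled[-1].loc[shared].mean() / chunk.loc[shared, kw_list].mean()
    scaled.append(chunk[kw_list] * ratio)   # column-wise rescale onto the first chunk's scale
rescaled = pd.concat(scaled)
rescaled = rescaled.loc[~rescaled.index.duplicated()]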
Example #28
0
from pytrends.request import TrendReq
import matplotlib.pyplot as plt
import pandas as pd
import os

# set the search keywords, region, and time period
keyword1 = "신라면"
keyword2 = "진라면"
local_area = "KR"
period = "today 5-y"

# connect to Google Trends and load the data
trend_obj = TrendReq()
trend_obj.build_payload(kw_list=[keyword1, keyword2], timeframe=period, geo=local_area)
trend_df = trend_obj.interest_over_time()

# work around matplotlib's Korean font rendering problem
from matplotlib import font_manager, rc
cwd = os.getcwd()
font_path = os.path.join(cwd, "data","malgun.ttf")   #폰트파일의 위치
font_name = font_manager.FontProperties(fname=font_path).get_name()
rc('font', family=font_name)

# draw the chart
plt.style.use("ggplot")
plt.figure(figsize=(14,5))
trend_df[keyword1].plot()
trend_df[keyword2].plot()
plt.title("Google Trends: %s vs. %s" % (keyword1, keyword2), size=15)
plt.legend(loc="best")
def getSearchesOverTime(searchQuery, startAndEndDates):
    # note: username/password login is a legacy pytrends interface; recent
    # versions of TrendReq no longer accept these arguments
    pytrend = TrendReq(GOOGLE_USERNAME,
                       GOOGLE_PASSWORD,
                       custom_useragent="get google trends data script")
    pytrend.build_payload(kw_list=[searchQuery], timeframe=startAndEndDates)
    return pytrend.interest_over_time()
def download_ght_by_country_today(kw_list):
    pytrend = TrendReq()
    pytrend.build_payload(kw_list, geo='US', timeframe='today 5-y')
    
    df = pytrend.interest_over_time()
    return df
Example #31
0
class KeywordOverTimeTrends:
    '''KeywordOverTimeTrends
    Dissects the time dimension of a given keyword. By extracting time
    series data from Google Trends for a particular keyword, we can study
    its time characteristics and see whether it shows any seasonality or
    longer-term trend.
    '''

    def __init__(self):
        plt.switch_backend('Agg')
        self.pytrends = TrendReq()
        self.keyword = None
        self.df = None 
        self.smodel = None
        self.sresult = None
        self.amodel = None
        self.aresult = None

    def start_search(self, keyword):
        self.keyword = keyword
        log_msg = 'starting keyword search for:' + self.keyword
        logger.info(log_msg)
        self.get_pytrend_data()
        logger.info('cleaning dataframe...')
        self.clean_df()
        logger.info('building SARIMA...')
        self.build_SARIMA()
        # logger.info('building ARIMA...')
        # self.build_ARIMA()
        logger.info('finish kot models!')

    def get_pytrend_data(self):
        self.pytrends.build_payload(kw_list=[self.keyword])
        self.df = self.pytrends.interest_over_time()

    def get_test_data(self, n=5):
        return self.df.sample(n)

    def clean_df(self):
        self.df.drop(['isPartial'], axis=1, inplace=True)
        self.df.index.freq = 'W'
        # Use the above instead of resampling it, which takes time.
        # self.df = self.df.resample('W').mean()

    def show_time_series_plot(self):
        self.df.plot(figsize=GRAPH_PARAMS_FIGSIZE, linewidth=GRAPH_PARAMS_LINEWIDTH, fontsize=GRAPH_PARAMS_FONTSIZE)
        plt.xlabel('Year', fontsize=GRAPH_PARAMS_FONTSIZE)
        return plt

    def show_time_series_plot_in_html(self):
        return plt_to_html(self.show_time_series_plot())

    # we will just use decomposition plots
    def show_rolling_average_plot(self):
        self.df.rolling(ROLLING_AVG).mean().plot(figsize=GRAPH_PARAMS_FIGSIZE, linewidth=GRAPH_PARAMS_LINEWIDTH, fontsize=GRAPH_PARAMS_FONTSIZE)
        plt.xlabel('Year', fontsize=GRAPH_PARAMS_FONTSIZE)
        plt.legend(['Rolling Avg 52wks'])
        return plt

    def show_rolling_average_plot_in_html(self):
        return plt_to_html(self.show_rolling_average_plot())
    
    # we will just use decomposition plots
    def show_first_order_diff_plot(self):
        self.df.diff().plot(figsize=GRAPH_PARAMS_FIGSIZE, linewidth=GRAPH_PARAMS_LINEWIDTH, fontsize=GRAPH_PARAMS_FONTSIZE)
        plt.xlabel('Year', fontsize=GRAPH_PARAMS_FONTSIZE)
        return plt
    
    def show_first_order_diff_plot_in_html(self):
        return plt_to_html(self.show_first_order_diff_plot())

    # we will just use decomposition plots
    def show_autocorrelation_plot(self):
        return pd.plotting.autocorrelation_plot(self.df)
    
    def show_autocorrelation_plot_in_html(self):
        return plt_to_html(self.show_autocorrelation_plot())
    
    def show_decomposition_plot(self):
        rcParams['figure.figsize'] = 12, 10
        decomposition = sm.tsa.seasonal_decompose(self.df, model='additive')
        decomposition.plot()
        return plt
    
    def show_decomposition_plot_in_html(self):
        return plt_to_html(self.show_decomposition_plot())

    def build_SARIMA(self):
        self.smodel = sm.tsa.statespace.SARIMAX(self.df,
                                order=SARIMA_ORDER,
                                seasonal_order=SARIMA_SEASONAL_ORDER,
                                enforce_stationarity=False,
                                enforce_invertibility=False)
        self.sresult = self.smodel.fit()
        # print(results.summary().tables[1])

    def show_SARIMA_diagnostics_plot(self):
        self.sresult.plot_diagnostics(figsize=GRAPH_PARAMS_FIGSIZE)
        return plt

    def show_SARIMA_diagnostics_plot_in_html(self):
        return plt_to_html(self.show_SARIMA_diagnostics_plot())

    def show_SARIMA_prediction_plot(self):
        pred_uc = self.sresult.get_forecast(steps=SARIMA_PREDICTION_STEPS)
        pred_ci = pred_uc.conf_int()
        ax = self.df.plot(label='Observed', figsize=GRAPH_PARAMS_FIGSIZE)
        pred_uc.predicted_mean.plot(ax=ax, label='Forecast')
        ax.fill_between(pred_ci.index,
                        pred_ci.iloc[:, 0],
                        pred_ci.iloc[:, 1], color='k', alpha=.25)
        ax.set_xlabel('Year')
        ax.set_ylabel(self.keyword)
        plt.legend()
        #plt.show()
        return plt
    
    def show_SARIMA_prediction_plot_in_html(self):
        return plt_to_html(self.show_SARIMA_prediction_plot())

    def show_all_plots_in_html(self):
        a = self.show_time_series_plot_in_html()
        b = self.show_rolling_average_plot_in_html()
        c = self.show_decomposition_plot_in_html()
        d = self.show_SARIMA_diagnostics_plot_in_html()
        e = self.show_SARIMA_prediction_plot_in_html()
        return {"tseries": a, "rolling": b, "decompose": c, "s-diag": d, "s-pred": e}

    def clear_all_plots(self):
        logger.info('all plots cleared!')
        plt.close('all')
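A minimal driver for the class, assuming the surrounding module already provides the logger, the plt_to_html helper, and the GRAPH_/SARIMA_/ROLLING_AVG constants it references:

kot = KeywordOverTimeTrends()
kot.start_search('bitcoin')            # pull the series, clean it, fit SARIMA
plots = kot.show_all_plots_in_html()   # dict of HTML-embedded figures
kot.clear_all_plots()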
Example #32
0
"""
Utilizes Google Trends (via pytrends) to pull traffic volume of a given keyword within a given timeframe.
"""

import pandas as pd
from pytrends.request import TrendReq

#list of keywords; Google Trends accepts at most 5 per request, longer lists may return HTTP error 400
keywords = [
    'cbdoil', 'cannabis', 'patchouli', 'ylang ylang oil', 'lavender oil'
]

#requests the keyword search volume from the last three months
pytrend = TrendReq(hl='en-US', tz=360)
pytrend.build_payload(kw_list=keywords, cat=0, timeframe='today 3-m', gprop='')
data = pytrend.interest_over_time()
data = data.drop(labels=['isPartial'], axis='columns')

#creates visual plot of traffic over time, remove triple quotes to create the image file
"""
image = data.plot(title = 'Traffic from the Last 3 Months')
fig = image.get_figure()
fig.savefig('traffic.png')
"""

#outputs the search results to a csv file
data.to_csv('test.csv', sep=';', encoding='utf_8_sig', header=True)
Example #33
0
    pass
pass

print("Targets: \nFirst Date: " + target_date_1 + "\nSecond Date: " + target_date_2 + "\nCountry: " + target_country +
      "\nState: " + target_state + "\nCounty Code: " + target_county + "\n")
print("Loaded keywords: ")
print(keywords_read())

# data requests
while collect_cycle != len(keywords_read()):
    # request keywords in chunks of five (the Google Trends per-request maximum)
    pytrend.build_payload(
        keywords_read()[collect_cycle:collect_cycle + 5],
        timeframe = target_date_1 + " " + target_date_2,
        geo = target_country + "-" + target_state + "-" + target_county)
    collection_list.append(pytrend.interest_over_time())
    collect_cycle += 5
pass

# merging dataframes
export_dataframe = pandas.concat(collection_list, axis = 1)

# remove isPartial columns
del export_dataframe["isPartial"]

# exports
if arguments.CSV_out == 1:
    print("Exporting to CSV...")
    export_dataframe.to_csv(str(randint(1, 9999999)) + ".csv")
pass
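keywords_read() is re-invoked on every pass of the loop above; caching its result once and slicing in fives is equivalent and avoids repeated reads, assuming it returns a plain list:

keywords = keywords_read()
for start in range(0, len(keywords), 5):
    pytrend.build_payload(
        keywords[start:start + 5],
        timeframe = target_date_1 + " " + target_date_2,
        geo = target_country + "-" + target_state + "-" + target_county)
    collection_list.append(pytrend.interest_over_time())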
Example #34
0
 def get_google_trends(self, kw_list, trdays=250, overlap=100, 
                       cat=0, geo='', tz=360, gprop='', hl='en-US',
                       sleeptime=1, isPartial_col=False, 
                       from_start=False, scale_cols=True):
     """Retrieve daily google trends data for a list of search terms
     
     Parameters
     ----------
     kw_list : list of search terms (max 5) - see pyTrends for more details
     trdays : the number of days to pull data for in a search
         (the max is around 270, though the website seems to indicate 90)
     overlap : the number of overlapped days when stitching two searches together
     cat : category to narrow results - see pyTrends for more details
     geo : two letter country abbreviation (e.g. 'US', 'UK')
         default is '', which returns global results - see pyTrends for more details
     tz : timezone offset
         (default is 360, which corresponds to US CST - see pyTrends for more details)
     gprop : filter results to a specific google property
         available options are 'images', 'news', 'youtube' or 'froogle'
         default is '', which refers to web searches - see pyTrends for more details
     hl : language (e.g. 'en-US' (default), 'es') - see pyTrends for more details
     sleeptime : when stitching multiple searches, this sets the pause between requests
     isPartial_col : whether to keep the isPartial column
         (default is False, i.e. the column is removed)
     from_start : when stitching multiple results, this determines whether searches
         are combined going forward or backwards in time
         (default is False, meaning searches are stitched with the most recent first)
     scale_cols : google trends traditionally returns scores between 0 and 100;
         stitching could produce values greater than 100, so setting this to
         True (default) rescales the values to range between 0 and 100
     
     Returns
     -------
     pandas Dataframe
     
     Notes
     -----
     This method is essentially a highly restricted wrapper for the pytrends package
     Any issues/questions related to its use would probably be more likely resolved
     by consulting the pytrends github page
     https://github.com/GeneralMills/pytrends
     """
     
     if len(kw_list)>5 or len(kw_list)==0:
         raise ValueError("The keyword list must contain between 1 and 5 search terms")
     if trdays>270:
         raise ValueError("trdays must not exceed 270")
     if overlap>=trdays:
         raise ValueError("Overlap can't exceed search days")
     stitch_overlap = trdays - overlap
     from_date = datetime.datetime.strptime(self.from_date, '%Y-%m-%d')
     to_date = datetime.datetime.strptime(self.to_date, '%Y-%m-%d')
     n_days = (to_date - from_date).days
     # launch pytrends request
     _pytrends = TrendReq(hl=hl, tz=tz)
     # get the dates for each search
     if n_days <= trdays:
         trend_dates = [' '.join([self.from_date, self.to_date])]
     else:
         trend_dates = ['{} {}'.format(
             (to_date - datetime.timedelta(i+trdays)).strftime("%Y-%m-%d"),
             (to_date - datetime.timedelta(i)).strftime("%Y-%m-%d"))
             for i in range(0, n_days-trdays+stitch_overlap, stitch_overlap)]
     if from_start:
         trend_dates = trend_dates[::-1]
     try:
         _pytrends.build_payload(kw_list, cat=cat, timeframe=trend_dates[0], 
                                geo=geo, gprop=gprop)
     except Exception as e:
         return pd.DataFrame({"error":e}, index=[0])
     output = _pytrends.interest_over_time().reset_index()
     if len(output)==0:
         return pd.DataFrame({"error":'search term returned no results (insufficient data)'}, index=[0])
     for date in trend_dates[1:]:
         time.sleep(sleeptime)
         try:
             _pytrends.build_payload(kw_list, cat=cat, timeframe=date, 
                                      geo=geo, gprop=gprop)
         except Exception as e:
             return pd.DataFrame({"error":e}, index=[0])
         temp_trend = _pytrends.interest_over_time().reset_index()
         temp_trend = temp_trend.merge(output, on="date", how="left")
         # it's ugly but we'll exploit the common column names
         # and then rename the underscore containing column names
         for kw in kw_list:
             norm_factor = np.ma.masked_invalid(temp_trend[kw+'_y']/temp_trend[kw+'_x']).mean()
             temp_trend[kw] = temp_trend[kw+'_x'] * norm_factor
         # keep only the dates that are not already covered by `output`
         temp_trend = temp_trend[temp_trend.isnull().any(axis=1)]
         temp_trend['isPartial'] = temp_trend['isPartial_x']
         output = pd.concat([output, temp_trend[['date', 'isPartial'] + kw_list]], axis=0)

     # restore the column order: date, isPartial, then the keywords
     output = output[['date', 'isPartial']+kw_list]
     
     if not isPartial_col:
         output = output.drop('isPartial', axis=1)
     output = output[output['date']>=self.from_date]
     if scale_cols:
         # the values in each column are relative to other columns
         # so we need to get the maximum value across the search columns
         max_val = float(output[kw_list].values.max())
         for col in kw_list:
             output[col] = 100.0*output[col]/max_val
     output = output.sort_values('date', ascending=self.ascending).reset_index(drop=True)
     return output
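A toy illustration of the overlap normalisation in the loop above: if the overlapping days scored 50 and 60 on the previous window's scale but 25 and 30 on the new one, every keyword in the new window is multiplied by 2.

import numpy as np
import pandas as pd

old = pd.Series([50.0, 60.0])   # overlap days, previous window's scale
new = pd.Series([25.0, 30.0])   # the same days on the new window's scale
norm_factor = np.ma.masked_invalid(old / new).mean()
print(norm_factor)   # 2.0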
Example #35
0
## FIRST RUN ##

# Connect to Google Trends. This only needs to run once; subsequent requests reuse the same session.
pytrend = TrendReq()

# Run the first request (if we want to start from today; otherwise we need to ask for an end_date as well)
today = datetime.today().date()
old_date = today

# Go back in time
new_date = today - timedelta(days=step)

# Create new timeframe for which we download data
timeframe = new_date.strftime('%Y-%m-%d')+' '+old_date.strftime('%Y-%m-%d')
pytrend.build_payload(kw_list=kw_list, timeframe = timeframe)
interest_over_time_df = pytrend.interest_over_time()

## RUN ITERATIONS

while new_date>start_date:
    
    ### Save the new date from the previous iteration.
    # overlap == 1 means the new window starts exactly where the
    # previous one stopped, so consecutive windows share one day.
    old_date = new_date + timedelta(days=overlap-1)
    
    ### Update the new date to take a step into the past
    # Since the timeframe that we can apply for daily data
    # is limited, we use step = maxstep - overlap instead of
    # maxstep.