def __init__(self):
    self.__aData = DataAggregator()
    self.__carousel = None
    self.__header = None
    self.__cart = None
    self.__hangarCameraLocation = None
def _execute_checkers(self):
    '''Executes the checkers for mismatched data.'''
    template_handler = TrueTemplateHandler(self.temp_name)
    mv = MissingValChecker(self.temp_name)
    da = DataAggregator(self.temp_name)
    self.df = mv.execute(self.df, template_handler)
    new_df = da.execute(self.df, template_handler)
def get_aggregate_data(q_out):
    all_data = retrieve_data(q_out)
    aggregator = DataAggregator()
    return aggregator.parse_data(all_data)
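# Usage sketch (hypothetical): assumes q_out is a queue of raw records that
# retrieve_data() drains, and that DataAggregator.parse_data() returns the
# aggregated result. The queue payload below is a made-up stand-in; only
# get_aggregate_data() itself comes from the snippet above.
from queue import Queue

q_out = Queue()
q_out.put({"source": "twitter", "text": "example record"})  # stand-in payload
aggregated = get_aggregate_data(q_out)
print(aggregated)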
def setUp(self):
    self.mock_template = MockBTemplate("mock_b_template")
    self.da = DataAggregator("mock_b_template")
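# Hypothetical companion test (sketch): mirrors the da.execute(df, template)
# call pattern used in _execute_checkers above. The input DataFrame, the
# assumption that MockBTemplate can stand in for a template handler, and the
# DataFrame return type are all assumptions, not confirmed API.
import pandas as pd

def test_execute_returns_dataframe(self):
    df = pd.DataFrame({"text": ["a", "b"]})  # minimal stand-in input
    result = self.da.execute(df, self.mock_template)
    self.assertIsInstance(result, pd.DataFrame)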
import re
import sys

import numpy as np
import pandas as pd
import wikipedia
from datetime import date

# DataAggregator is a project-local helper (import path assumed elsewhere).


class SummarizeNER(object):
    def __init__(self, df):
        self.data = df
        self.cleaned_data = self.get_cleaned_data()
        self.cleaned_phrases = self.get_ner_tags()

    def get_cleaned_data(self):
        return [self.clean(text) for text in self.data['text']]

    def get_summarized_data(self):
        wikidf = pd.DataFrame(columns=("NER", "Summary"))
        wikidf["NER"] = self.cleaned_phrases
        wikidf["Summary"] = self.get_wiki_summary()
        return wikidf

    def del_repeat(self, seq):
        # Remove duplicates while preserving order.
        seen = set()
        seen_add = seen.add
        return [x for x in seq if not (x in seen or seen_add(x))]

    def get_wiki_summary(self, sentences=4):
        wiki_summary = []
        for i, phrase in enumerate(self.cleaned_phrases):
            print("Downloading ({}/{}) wikipedia page...".format(
                i + 1, len(self.cleaned_phrases)), end="\r")
            try:
                summary = wikipedia.summary(phrase, sentences=sentences)
            except Exception as e:
                # On a disambiguation error, retry with the first suggestion
                # listed in the exception message.
                try:
                    suggestion = str(e).splitlines()[1]
                    summary = wikipedia.summary(suggestion, sentences=sentences)
                except Exception:
                    summary = "No wikipedia page found"
            wiki_summary.append(summary)
        return wiki_summary

    def clean(self, text, url=True, words_only=True, first_n_sent=(False, 4)):
        if url:
            # Strip URLs.
            text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*',
                          '', text)
        if words_only:
            # Keep letters only; everything else becomes a space.
            regex = re.compile('[^a-zA-Z]')
            text = regex.sub(' ', text)
        if first_n_sent[0]:
            # Keep only the first n sentences, n taken from the tuple instead
            # of a hard-coded count, and guard against texts with no match.
            match = re.match(r'(?:[^.:;]+[.:;]){%d}' % first_n_sent[1], text)
            if match:
                text = match.group()
        return text

    def get_ner_tags(self):
        sys.path.append('../preprocess')
        from nltk.tag.stanford import StanfordNERTagger
        st = StanfordNERTagger(
            '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
            '../stanford-ner/stanford-ner.jar')
        tokenized_list = [ct.split() for ct in self.cleaned_data]
        NERTags = st.tag_sents(tokenized_list)
        n = []
        for nt in NERTags:
            n.extend(nt)
        # Indexes of all words that carry an NER tag (i.e. not "O").
        ids = [i for i, a in enumerate(n) if a[1] != "O"]
        a = np.array(ids)
        # Group runs of consecutive indexes into multi-word phrases.
        consecutive_ids = np.split(a, np.where(np.diff(a) != 1)[0] + 1)
        phrases = []
        for ci in consecutive_ids:
            phrase = ""
            for id_ in ci:
                phrase += "{} ".format(n[id_][0])
            phrases.append(phrase)
        return self.del_repeat(phrases)


if __name__ == '__main__':
    data_helper = DataAggregator()
    date_range = [date.today().strftime('%Y-%m-%d')]  # Only today.
    df = data_helper.get_data(date_range=date_range)
    sn = SummarizeNER(df)
    sd = sn.get_summarized_data()
    # Write UTF-8 bytes directly so non-ASCII summaries print safely.
    sys.stdout.buffer.write(sd.to_string().encode('utf-8'))
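# Quick sanity check of clean() (illustrative sketch): building a full
# SummarizeNER would trigger NER tagging, so this bypasses __init__ purely
# for the demo. The sample string is made up.
s = SummarizeNER.__new__(SummarizeNER)
print(repr(s.clean("Read https://example.com now!!!")))  # 'Read  now   '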
import sys
from collections import Counter
from datetime import date

# DataAggregator and TwitterStatistics are project-local helpers; only the
# top_mentions method of TwitterStatistics is shown here.


def top_mentions(self, X):
    # Collect every @-mention across the stored tweets, then rank them.
    h = []
    for tweet in self.tweets:
        mentions = [
            m.get('screen_name')
            for m in tweet.entities.get("user_mentions")
        ]
        h.extend(mentions)  # extending with an empty list is a no-op
    c = Counter(h)
    return c.most_common(X)


if __name__ == '__main__':
    data_helper = DataAggregator()
    date_range = [date.today().strftime('%Y-%m-%d')]  # Only today.
    df = data_helper.get_data(date_range=date_range)
    tweet_stats = TwitterStatistics(df)
    tdf = tweet_stats.get_data()
    try:
        print(tdf.to_string())
    except UnicodeEncodeError:
        # Fall back to a raw UTF-8 write of the same stats frame.
        sys.stdout.buffer.write(tdf.to_string().encode('utf-8'))
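# Illustrative check of top_mentions (sketch): SimpleNamespace objects stand
# in for real Twitter API tweets, and the unbound method is called with a
# stand-in self. The data is made up; only the mention-counting logic is real.
from types import SimpleNamespace

fake = SimpleNamespace(tweets=[SimpleNamespace(
    entities={"user_mentions": [{"screen_name": "alice"},
                                {"screen_name": "bob"},
                                {"screen_name": "alice"}]})])
print(top_mentions(fake, 2))  # -> [('alice', 2), ('bob', 1)]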
import re
import sys

import pandas as pd
import wikipedia
from datetime import date

# DataAggregator is a project-local helper (import path assumed elsewhere).


class SummarizeNER(object):
    def __init__(self, df):
        self.data = df
        self.cleaned_data = self.get_cleaned_data()
        self.cleaned_phrases = self.get_ner_tags()

    def get_cleaned_data(self):
        return [self.clean(text) for text in self.data['text']]

    def get_summarized_data(self):
        self.data['NER'] = self.cleaned_phrases
        self.data['Wiki-NER-Summary'] = self.get_wiki_summary()
        return self.data

    def del_repeat(self, seq):
        # Remove duplicates while preserving order.
        seen = set()
        seen_add = seen.add
        return [x for x in seq if not (x in seen or seen_add(x))]

    def get_wiki_summary(self, sentences=4):
        wiki_summary = []
        for i, phrase in enumerate(self.cleaned_phrases):
            if phrase != 'N/A':
                print("Downloading ({}/{}) wikipedia page...".format(
                    i + 1, len(self.cleaned_phrases)), end="\r")
                try:
                    summary = wikipedia.summary(phrase[0], sentences=sentences)
                except Exception as e:
                    # On a disambiguation error, retry with the first
                    # suggestion listed in the exception message.
                    try:
                        suggestion = str(e).splitlines()[1]
                        summary = wikipedia.summary(suggestion,
                                                    sentences=sentences)
                    except Exception:
                        summary = "No wikipedia page found"
            else:
                summary = "No wikipedia page found"
            wiki_summary.append(summary)
        return wiki_summary

    def clean(self, text, url=True, words_only=True, first_n_sent=(False, 4)):
        if url:
            # Strip URLs.
            text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*',
                          '', text)
        if words_only:
            # Keep letters only; everything else becomes a space.
            regex = re.compile('[^a-zA-Z]')
            text = regex.sub(' ', text)
        if first_n_sent[0]:
            # Keep only the first n sentences, n taken from the tuple instead
            # of a hard-coded count, and guard against texts with no match.
            match = re.match(r'(?:[^.:;]+[.:;]){%d}' % first_n_sent[1], text)
            if match:
                text = match.group()
        return text

    def get_ner_tags(self):
        sys.path.append('../preprocess')
        from nltk.tag.stanford import StanfordNERTagger
        st = StanfordNERTagger(
            '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
            '../stanford-ner/stanford-ner.jar')
        tokenized_list = [ct.split() for ct in self.cleaned_data]
        NERTags = st.tag_sents(tokenized_list)
        tags = list(NERTags)
        # Per sentence, the indexes of all words that carry an NER tag.
        ids = [[i for i, a in enumerate(t) if a[1] != "O"] for t in tags]
        phrases = []
        for i, t in zip(ids, tags):
            phrase = ""
            tt = "N/A"
            for index, p in enumerate(i):
                if index == len(i) - 1:
                    phrase += "{}".format(t[p][0])
                    tt = phrase, t[p][1]  # (phrase, NER tag) tuple
                else:
                    phrase += "{} ".format(t[p][0])
            phrases.append(tt)
        return phrases


if __name__ == '__main__':
    data_helper = DataAggregator()
    date_range = [date.today().strftime('%Y-%m-%d')]  # Only today.
    df = data_helper.get_data(date_range=date_range)
    sn = SummarizeNER(df)
    sd = sn.get_summarized_data()
    # Write UTF-8 bytes directly so non-ASCII summaries print safely.
    sys.stdout.buffer.write(sd.to_string().encode('utf-8'))
import json
import re
from datetime import date
from time import sleep

import pandas as pd

# DataAggregator and GoogleDataHelper are project-local helpers
# (import paths assumed elsewhere).


class GoogleDataEnhancer(object):
    def __init__(self, df):
        self.data = self.get_data(df)
        self.domains = self.get_domains()
        self.results = self.google_search()

    def get_data(self, df):
        # Pull tweet texts and reddit submission titles out of the raw frame.
        idx = df[df['source'] == "twitter"].index.tolist()
        tweets = [self.clean(df["raw_data"][i].text) for i in idx]
        idx = df[df['source'] == "reddit"].index.tolist()
        subs = [df["raw_data"][i].title for i in idx]
        return tweets + subs

    def get_domains(self):
        with open("../domains.json", "r") as f:
            domains = json.load(f)
        return domains

    def clean(self, text):
        # Strip URLs, then everything that is not a letter.
        URLless_text = re.sub(
            r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text)
        regex = re.compile('[^a-zA-Z]')
        return regex.sub(' ', URLless_text)

    def in_domain(self, url):
        # Return the domain category whose URL list matches, else "".
        for d in self.domains:
            for u in self.domains[d]:
                if u in url:
                    return d
        return ""

    def google_search(self):
        results = []
        gd = GoogleDataHelper()
        print("* Google Searching Data...")
        for i, d in enumerate(self.data):
            try:
                d = self.clean(d)
                print("* * Downloading ({}/{}) query".format(
                    i + 1, len(self.data)))
                r = gd.get_data(querystring=d)
            except Exception as e:
                print("* * cannot download query ({}) because: ({})".format(
                    i, str(e)))
                # Empty placeholder so results stay aligned with data.
                r = pd.DataFrame()
            results.append(r)
            sleep(5)  # minimum delay so we don't look like a bot/script
        print("* Download complete! ")
        return results

    def enhance(self):
        # One row per query; each domain column collects (url, text) hits.
        df = pd.DataFrame({"data": self.data})
        df["results"] = self.results
        for d in self.domains:
            df[d] = [[] for _ in range(len(df))]  # independent list per row
        for i, r in enumerate(self.results):
            if r.empty:
                continue  # failed queries have an empty placeholder frame
            for url, text in zip(r['author'], r['text']):
                _type = self.in_domain(url)
                if _type != "":
                    df[_type][i].append((url, text))
        return df

    # def wiki_summarize(self):
    #     import wikipedia
    #     self.data


if __name__ == '__main__':
    data_helper = DataAggregator()
    date_range = [date.today().strftime('%Y-%m-%d')]  # Only today.
    df = data_helper.get_data(date_range=date_range)
    gde = GoogleDataEnhancer(df)
    print(gde.enhance())
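# Illustrative check of in_domain (sketch): a hand-built domains mapping
# stands in for the project's ../domains.json file, and __init__ is bypassed
# to avoid the network calls it triggers. The URLs and categories are made up.
gde = GoogleDataEnhancer.__new__(GoogleDataEnhancer)
gde.domains = {"news": ["bbc.com", "reuters.com"], "social": ["twitter.com"]}
print(gde.in_domain("https://www.bbc.com/article"))  # -> "news"
print(gde.in_domain("https://example.org"))          # -> ""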