def _get_plenaries(dico, dico_nl, document):
    document.plenaries = []
    for key, key_nl in zip(sorted(filter(lambda x: re.match("(\d+. )?SEANCE PLENIERE CHAMBRE", x), dico.keys())),
                           sorted(filter(lambda x: re.match("(\d+. )?PLENAIRE VERGADERING KAMER", x), dico_nl.keys()))):
        pl = DocumentPlenary()
        pl.visibility["fr"] = clean_text(dico[key]["head"].text).split()[-1]
        pl.visibility["nl"] = clean_text(dico_nl[key_nl]["head"].text).split()[-1]
        pl.type["fr"] = " ".join(clean_text(dico[key]["head"].text).split()[:-1])
        pl.type["nl"] = " ".join(clean_text(dico_nl[key_nl]["head"].text).split()[:-1])

        pl.agenda = []
        if dico[key].get("Calendrier"):
            fr = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico[key]["Calendrier"].contents[::2])))
            nl = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico_nl[key_nl]["Kalender"].contents[::2])))
            for (_date, _type), (_, _type_nl) in zip(fr, nl):
                pl.agenda.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})

        pl.incident = []
        if dico[key].get("Incident"):
            fr = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico[key]["Incident"].contents[::2])))
            nl = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico_nl[key_nl]["Incident"].contents[::2])))
            for (_date, _type), (_, _type_nl) in zip(fr, nl):
                pl.incident.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})

        pl.save()
        document.plenaries.append(pl)
def parse_house_cosponsors(self, bill, cell):
    # if there's only one sponsor, we don't have to worry about this.
    if (not cell.a.nextSibling or not cell.a.nextSibling.nextSibling or
            not 'href' in cell.a.nextSibling.nextSibling):
        cosponsor_dirty = cell.a.em.contents[0]
        cosponsor = clean_text(cosponsor_dirty)
        bill.add_sponsor('cosponsor', cosponsor, sponsor_link=cell.a['href'])
    else:
        # there are several sponsors, and we have to go to the bill text
        bill_text_url = cell.a.nextSibling.nextSibling['href']

        try:
            doc = self.urlopen(bill_text_url)

            # people between (Sponsor) and (Co-Sponsor) are the cosponsors
            m = re.search(r"\(Sponsor\),?(.*)\(Co", doc, re.DOTALL)
            if m:
                cosponsor_list = clean_text(m.group(1))
                cosponsor_list = re.split(" ?(?:,| AND ) ?", cosponsor_list)

                for cosponsor_dirty in cosponsor_list:
                    cosponsor = clean_text(cosponsor_dirty)
                    bill.add_sponsor('cosponsor', cosponsor)
        except urllib2.HTTPError as e:
            if e.code == 404:
                # Some of the bill text pages are broken, but the
                # rest of the bill metadata is valid so just
                # log the error and move on
                self.log('404 on %s, continuing' % bill_text_url)
            else:
                raise e
def _get_in_charged_commissions(dico, dico_nl, document):
    document.in_charge_commissions = []
    for key, key_nl in zip(sorted(filter(lambda x: re.match("(\d+. )?COMMISSION CHAMBRE", x), dico.keys())),
                           sorted(filter(lambda x: re.match("(\d+. )?COMMISSIE KAMER", x), dico_nl.keys()))):
        icc = InChargeCommissions()
        icc.visibility["fr"] = clean_text(dico[key]["head"].text).split()[-1]
        icc.visibility["nl"] = clean_text(dico_nl[key_nl]["head"].text).split()[-1]
        icc.commission["fr"] = " ".join(clean_text(dico[key]["head"].text).split()[:-1])
        icc.commission["nl"] = " ".join(clean_text(dico_nl[key_nl]["head"].text).split()[:-1])

        if dico[key].get("Rapporteur"):
            # FIXME link to actual deputies
            icc.rapporters = map(clean_text, dico[key]["Rapporteur"].text.split("\n\t\t\t\t\t"))

        icc.incident = []
        if dico[key].get("Incident"):
            fr = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico[key]["Incident"].contents[::2])))
            nl = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico_nl[key_nl]["Incident"].contents[::2])))
            for (_date, _type), (_, _type_nl) in zip(fr, nl):
                icc.incident.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})

        icc.agenda = []
        if dico[key].get("Calendrier"):
            fr = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico[key]["Calendrier"].contents[::2])))
            nl = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico_nl[key_nl]["Kalender"].contents[::2])))
            for (_date, _type), (_, _type_nl) in zip(fr, nl):
                icc.agenda.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})

        if dico[key].get("Rapport"):
            icc.rapport = {"url": dico[key]["Rapport"].a["href"],
                           "date": clean_text(dico[key]["Rapport"].contents[-2])}

        icc.save()
        document.in_charge_commissions.append(icc)
def _get_document_chambre(dico, dico_nl, document):
    if not dico.get("Document Chambre"):
        return

    chambre_dico = dico['Document Chambre']
    chambre_dico_nl = dico_nl['Document Kamer']

    document_chambre = DocumentChambre()
    document_chambre.deposition_date = get_text_else_blank(chambre_dico, u'Date de dépôt')
    document_chambre.type["fr"] = chambre_dico[u'Type de document'].text
    document_chambre.type["nl"] = chambre_dico_nl[u'Document type'].text
    document_chambre.taken_in_account_date = get_text_else_blank(chambre_dico, u'Prise en considération')
    document_chambre.distribution_date = get_text_else_blank(chambre_dico, u'Date de distribution')
    document_chambre.sending_date = get_text_else_blank(chambre_dico, u'Date d\'envoi')
    document_chambre.ending_date = get_text_else_blank(chambre_dico, u'Date de fin')
    document_chambre.status["fr"] = get_text_else_blank(chambre_dico, u'Statut')
    document_chambre.status["nl"] = get_text_else_blank(chambre_dico_nl, u'Status')
    document_chambre.comments["fr"] = get_text_else_blank(chambre_dico, u'Commentaire').split(' ')
    document_chambre.comments["nl"] = get_text_else_blank(chambre_dico_nl, u'Commentaar').split(' ')

    _get_authors(chambre_dico, chambre_dico_nl, document_chambre)

    url, tipe, session = clean_text(str(chambre_dico[u'head']).replace(" ", "")).split("<br />")
    _, tipe_nl, _ = clean_text(str(chambre_dico_nl[u'head']).replace(" ", "")).split("<br />")
    url = re.search('href="([^"]+)', url).groups()[0] if "href" in url else url
    document_chambre.pdf = DocumentChambrePdf.objects.create(url=url,
                                                             type={"fr": tipe.strip(), "nl": tipe_nl.strip()},
                                                             session=session.split()[-2])

    _get_next_documents(chambre_dico, chambre_dico_nl, document_chambre)

    if chambre_dico.get(u'Document(s) joint(s)/lié(s)'):
        document_chambre.joint_pdfs = [{"url": x.a["href"],
                                        "title": {"fr": x.contents[0][1:-1],
                                                  "nl": y.contents[0][1:-1]}}
                                       for x, y in zip(chambre_dico[u'Document(s) joint(s)/lié(s)'],
                                                       chambre_dico_nl[u'Gekoppeld(e)/verbonden document(en)'],)]

    document_chambre.save()
    document.document_chambre = document_chambre
def parse_cosponsors_from_bill(self, bill, url):
    bill_page = self.urlopen(url)
    bill_page = lxml.html.fromstring(bill_page)
    sponsors_text = find_nodes_with_matching_text(
        bill_page, '//p/span', r'\s*INTRODUCED.*')
    if len(sponsors_text) == 0:
        # probably its withdrawn
        return
    sponsors_text = sponsors_text[0].text_content()
    sponsors = clean_text(sponsors_text).split(',')
    # if there are several comma separated entries, list them.
    if len(sponsors) > 1:
        # the sponsor and the cosponsor were already got from the previous
        # page, so ignore those:
        sponsors = sponsors[2::]
    for part in sponsors:
        parts = re.split(r' (?i)and ', part)
        for sponsor in parts:
            cosponsor_name = clean_text(sponsor)
            if cosponsor_name != "":
                cosponsor_name = cosponsor_name.replace(
                    u'\u00a0', " ")  # epic hax
                for name in re.split(r'\s+AND\s+', cosponsor_name):
                    # for name in cosponsor_name.split("AND"):
                    name = name.strip()
                    if name:
                        bill.add_sponsor('cosponsor', name)
def add_text(status):
    """Shortens the text to 140 characters for displaying it in the list control."""
    message = ""
    if status.has_key("copy_history"):
        txt = status["copy_history"][0]["text"]
    else:
        txt = status["text"]
    if len(txt) < 140:
        message = utils.clean_text(txt)
    else:
        message = utils.clean_text(txt[:139])
    return message
def parse_cosponsors_from_bill(self, bill, url):
    with self.urlopen(url) as bill_page:
        bill_page = lxml.html.fromstring(bill_page)
        sponsors_text = find_nodes_with_matching_text(bill_page, '//p/span', r'\s*INTRODUCED.*')
        if len(sponsors_text) == 0:
            # probably its withdrawn
            return
        sponsors_text = sponsors_text[0].text_content()
        sponsors = clean_text(sponsors_text).split(',')
        if len(sponsors) > 1:
            # if there are several comma separated entries, list them.
            # the sponsor and the cosponsor were already got from the previous page, so ignore those:
            sponsors = sponsors[2::]
        for part in sponsors:
            parts = re.split(r' (?i)and ', part)
            for sponsor in parts:
                bill.add_sponsor('cosponsor', clean_text(sponsor))
def _build_sub_section(i, dico):
    sub_section = clean_text(i.td.b.text)
    if dico.get(sub_section):
        raise Exception("'%s' is already used as a key for '%s'" % (sub_section, dico[sub_section]))
    dico[sub_section] = AccessControlDict()
    dico[sub_section]["head"] = i('td')[1]
    return sub_section
def tag_tokens(self, tokens, no_repeats=False):
    """
    Runs the SRL process on the given tokens.

    :param tokens: a list of tokens (as strings)
    :param no_repeats: whether to prevent repeated argument labels
    :returns: a list of lists (one list for each sentence). Sentences have tuples
        (all_tokens, predicate, arg_structure), where arg_structure is a dictionary
        mapping argument labels to the words it includes.
    """
    tokens_obj = [attributes.Token(utils.clean_text(t, False)) for t in tokens]
    converted_bound = np.array([self.boundary_reader.converter.convert(t) for t in tokens_obj])
    converted_class = np.array([self.classify_reader.converter.convert(t) for t in tokens_obj])

    pred_positions = self.find_predicates(tokens_obj)

    # first, argument boundary detection
    # the answer includes all predicates
    answers = self.boundary_nn.tag_sentence(converted_bound, pred_positions)
    boundaries = [[self.boundary_itd[x] for x in pred_answer] for pred_answer in answers]
    arg_limits = [utils.boundaries_to_arg_limits(pred_boundaries) for pred_boundaries in boundaries]

    # now, argument classification
    answers = self.classify_nn.tag_sentence(converted_class, pred_positions,
                                            arg_limits, allow_repeats=not no_repeats)
    arguments = [[self.classify_itd[x] for x in pred_answer] for pred_answer in answers]

    structures = _group_arguments(tokens, pred_positions, boundaries, arguments)
    return SRLAnnotatedSentence(tokens, structures)
def df_transform(self, terms):
    self.df[pd.isnull(self.df['Comment'])] = ""
    self.df = self.df.drop_duplicates('Comment')
    self.df['date'] = self.df['date'].apply(lambda x: unix_convert(x))
    self.df['Comment'] = self.df['Comment'].apply(lambda x: clean_text(str(x)))
    self.df['Sentiment_raw'] = self.df.apply(lambda row: sentiment(row['Comment']), axis=1)
    self.df['Sentiment'] = self.df.apply(lambda row: sentiment_new(row['Comment'], terms), axis=1)
    self.df['State'] = self.df.apply(lambda row: state_label(str(row['Locations'])), axis=1)
    self.df = pd.merge(self.df, self.longlat, how='left', on='State')
def parse_stations(self, html):
    bs = BeautifulSoup(html)
    tables = bs.findAll('table', {'class': 'show_fw'})
    st = {}

    for i in range(2):
        trs = tables[i].findAll('tr')
        direction = clean_text(trs[0].text.replace('Fahrtrichtung', ''))
        sta = []
        for tr in trs[2:-1]:
            if tr.a:
                sta.append((clean_text(tr.a.text), defaults.base_url + tr.a['href']))
            else:
                sta.append((clean_text(tr.text), None))
        st[direction] = sta
    return st
def _get_next_documents(chambre_dico, chambre_dico_nl, document_chambre):
    if chambre_dico.get('Document(s) suivant(s)'):
        for d, d_nl in zip(document_pdf_part_cutter(chambre_dico[u'Document(s) suivant(s)']),
                           document_pdf_part_cutter(chambre_dico_nl[u'Opvolgend(e) document(en)'])):
            logger.debug("add pdf %s" % clean_text(d[0].font.text))
            doc = OtherDocumentChambrePdf()
            doc.url = d[0].a['href'] if d[0].a else d[0].td.text
            doc.type["fr"] = clean_text(d[0].font.text)
            doc.type["nl"] = clean_text(d_nl[0].font.text)
            doc.distribution_date = d[1]('td')[-1].text

            for dep, dep_nl in zip(d[2:], d_nl[2:]):
                if dep.a:
                    lachambre_id = re.search('key=(\d+)', dep.a["href"]).groups()[0]
                    deputy = Deputy.objects.get(lachambre_id=lachambre_id)
                    doc.authors.append({"lachambre_id": deputy.lachambre_id,
                                        "id": deputy.id,
                                        "full_name": deputy.full_name,
                                        "role": {"fr": dep('td')[-1].i.text[1:-1],
                                                 "nl": dep_nl('td')[-1].i.text[1:-1]}})
                else:
                    doc.authors.append({"lachambre_id": -1,
                                        "id": -1,
                                        "full_name": dep('td')[-1].contents[2].strip(),
                                        "role": {"fr": dep('td')[-1].i.text[1:-1],
                                                 "nl": dep_nl('td')[-1].i.text[1:-1]}})

            doc.save()
            document_chambre.other_pdfs.append(doc)
def _build_first_level(i, dico):
    key = clean_text(i.td.text)
    # we can get several Moniteur erratum entries
    if unicode(key) in ('Moniteur erratum', 'Staatsblad erratum'):
        if not dico.get(key):
            dico[key] = []
        dico[key].append(i('td')[1])
    else:
        if dico.get(key):
            raise Exception("'%s' is already used as a key for '%s'" % (key, dico[key]))
        dico[key] = i('td')[1]
def _build_pdf_sub_section(i, dico, sub_section):
    key = clean_text(i.td.text)
    # we can have a list of joined documents
    if unicode(key) in (u'Document(s) joint(s)/lié(s)', u'Gekoppeld(e)/verbonden document(en)'):
        if not dico[sub_section].get(key):
            dico[sub_section][key] = []
        dico[sub_section][key].append(i('td')[1])
    elif dico[sub_section].get(key):
        raise Exception("'%s' is already used as a key in the sub_section '%s' for '%s'" % (key, sub_section, dico[sub_section][key]))
    else:
        dico[sub_section][key] = i('td')[1]
def _get_competences(dico, dico_nl, document):
    # FIXME: meh, DRY
    if dico.get(u"Compétence") and dico_nl.get(u"Bevoegdheid"):
        document.timeline = []
        for (_date, _title), (_, _title_nl) in zip([clean_text(x).split(u" \xa0 ", 1) for x in dico[u"Compétence"]["head"].contents[::2]],
                                                   [clean_text(x).split(u" \xa0 ", 1) for x in dico_nl[u"Bevoegdheid"]["head"].contents[::2]]):
            logger.debug("append time line %s %s %s" % (_date, _title, _title_nl))
            document.timeline.append(DocumentTimeLine.objects.create(title={"fr": _title, "nl": _title_nl}, date=_date))
    elif dico.get(u"Compétence"):
        document.timeline = []
        for (_date, _title) in [clean_text(x).split(u" \xa0 ", 1) for x in dico[u"Compétence"]["head"].contents[::2]]:
            logger.debug("append time line %s %s %s" % (_date, _title, ""))
            document.timeline.append(DocumentTimeLine.objects.create(title={"fr": _title, "nl": ""}, date=_date))
    elif dico_nl.get(u"Bevoegdheid"):
        document.timeline = []
        for (_date, _title_nl) in [clean_text(x).split(u" \xa0 ", 1) for x in dico_nl[u"Bevoegdheid"]["head"].contents[::2]]:
            logger.debug("append time line %s %s %s" % (_date, "", _title_nl))
            document.timeline.append(DocumentTimeLine.objects.create(title={"fr": "", "nl": _title_nl}, date=_date))

    if dico.get("Analyse des interventions"):
        document.analysis = get_or_create(Analysis, _id="lachambre_id",
                                          lachambre_id=dico["Analyse des interventions"]["head"].a.text,
                                          url=dico["Analyse des interventions"]["head"].a["href"])
def get_document_features(self, document):
    '''
    Extract features from the document. The currently supported feature is
    the existence of a word in the document.

    :param document: a dictionary with 'text' key and 'tags' key.
    '''
    document = clean_text(document)
    document_words = set(document.split())
    features = {}
    for word in self.get_word_features():
        features['contains(%s)' % word] = (word in document_words)
    return features
def document_pdf_part_cutter(soup):
    result = []
    blob = [soup('tr')[0]]
    for i in soup('tr')[1:]:
        if not clean_text(i.text):
            continue
        if not i.img or not i.img.get("class") or i.img["class"] != "picto":
            blob.append(i)
        else:
            result.append(blob)
            blob = [i]
    result.append(blob)
    return result
def hk_freq(data_dir, hk_dir): print("hk freq") data = get_json_data(data_dir) at = AutoTag() for entry in data: entry["text"] = clean_text(entry["text"]) if not os.path.isdir(hk_dir): os.mkdir(hk_dir) with open(hk_dir + "total", "w") as f: pass word_count = at.count_data([w for entry in data for w in entry["text"].split()], hk_dir + "total") words = [w.encode("utf-8") for w, c in word_count if c > 40] with open(hk_dir + "freqs.csv", "wb") as csvfile: # data_encoded = [w.encode('utf-8') for w,c in word_count if c > 40] w = csv.writer(csvfile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL) w.writerow([u"HK"] + words) # csvfile.write(','.join([u'HK']+words) + '\n') hkwords = {} data_json = get_json(data_dir) for json_entry in data_json: if json_entry["model"] != "facebook_feeds.facebook_feed": continue name = json_entry["fields"]["name"] print(name) if not name: continue name = name.encode("utf-8") word_count = at.count_data( [w for entry in data for w in entry["text"].split() if entry["feed"] == json_entry["pk"]], hk_dir + name ) word_dict = {w.encode("utf-8"): c for w, c in word_count} hkwords[name] = [] for word in words: if word not in word_dict: hkwords[name].append(str(0)) else: hkwords[name].append(str(word_dict[word])) with open(hk_dir + "freqs.csv", "a") as csvfile: writer = csv.writer(csvfile, delimiter=",") # writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) writer.writerow([name] + hkwords[name]) with open(hk_dir + "freqs_t.csv", "a") as csvfile: writer = csv.writer(csvfile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL) for name in hkwords: writer.writerow([name] + hkwords[name])
def tag_tokens(self, tokens):
    """
    Tags a given list of tokens.

    Tokens should be produced with the nlpnet tokenizer in order to
    match the entries in the vocabulary. If you have non-tokenized text,
    use POSTagger.tag(text).

    :param tokens: a list of strings
    :returns: a list of strings (the tags)
    """
    converter = self.reader.converter
    converted_tokens = np.array([converter.convert(utils.clean_text(token, False))
                                 for token in tokens])
    answer = self.nn.tag_sentence(converted_tokens)
    tags = [self.itd[tag] for tag in answer]
    return tags
def compile_episode_transcript(trans_id, db):
    """
    Uses the Audiosearch database to compile a transcript for the podcast
    episode associated with trans_id.

    Parameters
    ----------
    trans_id : int
        The Audiosearch transcript ID for a particular podcast episode as
        found using find_episode_transcript_ids
    db : database connection
        The connection to the Audiosearch Postgres database

    Returns
    -------
    transcript : np.array of shape (n, 4)
        An array containing the transcript for the podcast episode associated
        with trans_id. Each row corresponds to a line in the transcript, and
        the columns correspond to [start_time, end_time, utterance, speaker_id]
    """
    transcript = []
    trans = get_transcript(db, trans_id).sort_values(by="start_time")

    # line contents: [start_time, end_time, utterance, speaker_id]
    for idx in range(trans.shape[0]):
        speaker = trans['speaker_id'][idx]
        text = clean_text(trans['text'][idx])
        start = trans['start_time'][idx] / 60.
        end = trans['end_time'][idx] / 60.

        if speaker is None or np.isnan(speaker):
            speaker = -1  # this happens a lot in the audiosearch db..

        if text == '.':
            continue

        line = [start, end, text, speaker]

        # skip duplicate lines
        if idx > 0 and line[2] == transcript[-1][2]:
            continue

        transcript.append(line)
    return np.asarray(transcript)
def hk_freq(data_dir, hk_dir): print('hk freq') data = get_json_data(data_dir) at = AutoTag() for entry in data: entry['text'] = clean_text(entry['text']) if not os.path.isdir(hk_dir): os.mkdir(hk_dir) with open(hk_dir+'total', 'w') as f: pass word_count = at.count_data([w for entry in data for w in entry['text'].split()],hk_dir+'total') words = [w.encode('utf-8') for w,c in word_count if c > 40] with open(hk_dir+'freqs.csv', 'wb') as csvfile: # data_encoded = [w.encode('utf-8') for w,c in word_count if c > 40] w = csv.writer(csvfile, delimiter = ',', quotechar='"', quoting=csv.QUOTE_MINIMAL) w.writerow([u'HK']+words) # csvfile.write(','.join([u'HK']+words) + '\n') hkwords = {} data_json = get_json(data_dir) for json_entry in data_json: if json_entry['model'] != "facebook_feeds.facebook_feed": continue name = json_entry['fields']['name'] print(name) if not name: continue name = name.encode('utf-8') word_count = at.count_data([w for entry in data for w in entry['text'].split() if entry["feed"] == json_entry['pk']],hk_dir+name) word_dict = {w.encode('utf-8'):c for w,c in word_count} hkwords[name] = [] for word in words: if word not in word_dict: hkwords[name].append(str(0)) else: hkwords[name].append(str(word_dict[word])) with open(hk_dir+'freqs.csv', 'a') as csvfile: writer = csv.writer(csvfile, delimiter=',') # writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) writer.writerow([name]+hkwords[name]) with open(hk_dir+'freqs_t.csv', 'a') as csvfile: writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) for name in hkwords: writer.writerow([name]+hkwords[name])
def parse_senate_cosponsors(self, bill, url):
    bill.add_source(url)
    with self.soup_context(url) as cosponsors_page:
        # cosponsors are all in a table
        cosponsor_table = cosponsors_page.find(id="dgCoSponsors")
        cosponsors = cosponsor_table.findAll("tr")

        for cosponsor_row in cosponsors:
            # cosponsors include district, so parse that out
            cosponsor_string = cosponsor_row.font.contents[0]
            cosponsor = clean_text(cosponsor_string)

            # they give us a link to the congressperson, so we might
            # as well keep it.
            cosponsor_url = cosponsor_row.a.href
            bill.add_sponsor('cosponsor', cosponsor, sponsor_link=cosponsor_url)
def parse_senate_cosponsors(self, bill, url):
    bill.add_source(url)
    with self.urlopen(url) as cosponsors_page:
        cosponsors_page = lxml.html.fromstring(cosponsors_page)
        # cosponsors are all in a table
        cosponsors = cosponsors_page.xpath('//table[@id="dgCoSponsors"]/tr/td/a')
        # print "looking for cosponsors = %s" % cosponsors

        for cosponsor_row in cosponsors:
            # cosponsors include district, so parse that out
            cosponsor_string = cosponsor_row.text_content()
            cosponsor = clean_text(cosponsor_string)
            cosponsor = cosponsor.split(',')[0]

            # they give us a link to the congressperson, so we might
            # as well keep it.
            cosponsor_url = cosponsor_row.attrib['href']
            bill.add_sponsor('cosponsor', cosponsor, sponsor_link=cosponsor_url)
def test_doc(self, document, tags, thresh=0.3):
    '''
    Test which tags should be assigned to the given document.

    :param document: an entry with 'text' key.
    :param tags: tags to test.
    :param thresh: threshold for the tag probability.
    :return probs: list of the most probable tags and their probability.
    '''
    document["text"] = clean_text(document["text"])
    document["features"] = self.get_document_features(document["text"])
    probs = []
    for tag in tags:
        classifier = self.load_classifier(tag)
        if not classifier:
            continue
        prob = classifier.prob_classify(document["features"])
        if prob.prob(True) > thresh:
            probs.append((prob.prob(True), tag))
    probs = sorted(probs, reverse=True)
    return probs
def _get_document_senat(dico, dico_nl, document):
    if not dico.get(u"Document Sénat"):
        return

    senat_dico = dico[u"Document Sénat"]
    senat_dico_nl = dico_nl[u"Document Senaat"]

    document_senat = DocumentSenat()
    document_senat.deposition_date = senat_dico[u"Date de dépôt"].text
    document_senat.ending_date = get_text_else_blank(senat_dico, u"Date de fin")
    document_senat.type["fr"] = senat_dico[u"Type de document"].text
    document_senat.type["nl"] = senat_dico_nl[u"Document type"].text
    document_senat.comments["fr"] = get_text_else_blank(senat_dico, u'Commentaire').split(' - ')
    document_senat.comments["nl"] = get_text_else_blank(senat_dico_nl, u'Commentaar').split(' - ')
    document_senat.author = clean_text(get_text_else_blank(senat_dico, u"Auteur(s)"))
    document_senat.status["fr"] = get_text_else_blank(senat_dico, u'Statut')
    document_senat.status["nl"] = get_text_else_blank(senat_dico_nl, u'Status')

    url, tipe, session = clean_text(str(senat_dico[u'head']).replace(" ", "")).split("<br />")
    _, tipe_nl, _ = clean_text(str(senat_dico_nl[u'head']).replace(" ", "")).split("<br />")
    url = re.search('href="([^"]+)', url).groups()[0] if "href" in url else url
    document_senat.pdf = DocumentSenatPdf.objects.create(url=url,
                                                         type={"fr": tipe.strip(), "nl": tipe_nl.strip()},
                                                         session=session.split()[-2])

    if senat_dico.get('Document(s) suivant(s)'):
        for d, d_nl in zip(document_pdf_part_cutter(senat_dico[u'Document(s) suivant(s)']),
                           document_pdf_part_cutter(senat_dico_nl[u'Opvolgend(e) document(en)'])):
            logger.debug("add pdf %s" % clean_text(d[0].font.text))
            doc = OtherDocumentSenatPdf()
            doc.url = d[0].a['href'] if d[0].a else d[0].td.text
            doc.type["fr"] = clean_text(d[0].font.text)
            doc.type["nl"] = clean_text(d_nl[0].font.text)
            doc.date = d[0]('td')[-1].contents[0]
            doc.authors = []
            for dep, dep_nl in zip(d[1:], d_nl[1:]):
                doc.authors.append({"full_name": unicode(dep('td')[-1].contents[2]).strip(),
                                    "role": {"fr": dep('td')[-1].i.text[1:-1],
                                             "nl": dep_nl('td')[-1].i.text[1:-1]}})
            doc.save()
            document_senat.other_pdfs.append(doc)

    document_senat.save()
    document.document_senat = document_senat
def _get_first_level_data(dico, dico_nl, document):
    document.deposition_date = get_text_else_blank(dico, u"Date de dépôt")
    document.constitution_article["fr"] = clean_text(get_text_else_blank(dico, "Article Constitution"))
    document.constitution_article["nl"] = clean_text(get_text_else_blank(dico_nl, "Artikel Grondwet"))
    if dico.get("Descripteur Eurovoc principal"):
        document.eurovoc_main_descriptor["fr"] = dico["Descripteur Eurovoc principal"]["head"].text
    if dico.get("Eurovoc-hoofddescriptor"):
        document.eurovoc_main_descriptor["nl"] = dico_nl["Eurovoc-hoofddescriptor"]["head"].text
    document.vote_date = get_text_else_blank(dico, "Vote Chambre")
    document.law_date = get_text_else_blank(dico, "Date de la loi")
    document.moniteur_number = get_text_else_blank(dico, u"Moniteur n°")
    document.moniteur_date = get_text_else_blank(dico, u"Date moniteur")
    document.vote_senat_date = get_text_else_blank(dico, u"Vote Sénat")
    document.candidature_vote_date = get_text_else_blank(dico, u"Vote candidature")

    if dico.get("Etat d'avancement"):
        document.status_chambre["fr"] = clean_text(dico["Etat d'avancement"].contents[0])
        document.status_senat["fr"] = clean_text(dico["Etat d'avancement"].contents[2]) if len(dico["Etat d'avancement"]) >= 3 else None

    if dico.get("Stand van zaken"):
        document.status_chambre["nl"] = clean_text(dico_nl["Stand van zaken"].contents[0])
        document.status_senat["nl"] = clean_text(dico_nl["Stand van zaken"].contents[2]) if len(dico_nl["Stand van zaken"]) >= 3 else None

    if dico.get("Descripteurs Eurovoc"):
        document.eurovoc_descriptors["fr"] = map(lambda x: x.strip(), dico["Descripteurs Eurovoc"]["head"].text.split("|"))
    if dico.get("Eurovoc descriptoren"):
        document.eurovoc_descriptors["nl"] = map(lambda x: x.strip(), dico_nl["Eurovoc descriptoren"]["head"].text.split("|"))

    if dico.get("Candidats-descripteurs Eurovoc"):
        document.eurovoc_candidats_descriptors["fr"] = map(lambda x: x.strip(), dico["Candidats-descripteurs Eurovoc"]["head"].text.split("|"))
    if dico.get("Eurovoc kandidaat-descriptoren"):
        document.eurovoc_candidats_descriptors["nl"] = map(lambda x: x.strip(), dico_nl["Eurovoc kandidaat-descriptoren"]["head"].text.split("|"))

    if dico.get(u"Mots-clés libres"):
        document.keywords["fr"] = map(lambda x: x.strip(), dico[u"Mots-clés libres"]["head"].text.split("|"))
    if dico.get(u"Vrije trefwoorden"):
        document.keywords["nl"] = map(lambda x: x.strip(), dico_nl[u"Vrije trefwoorden"]["head"].text.split("|"))

    if dico.get("Documents principaux"):
        document.main_docs["fr"] = map(lambda x: x.strip(), filter(lambda x: x != "<br>", dico["Documents principaux"].contents))
    if dico.get("Hoodfdocumenten"):
        document.main_docs["nl"] = map(lambda x: x.strip(), filter(lambda x: x != "<br>", dico_nl["Hoodfdocumenten"].contents))
def buildTextBlocks(self, filterPos=None, days_to_avoid=[]): MIN_TOKENS = 10 THRESHOLD = 0.4 textBlocksPerDay = {} blockVectorsPerDay = {} for date, listMsgs in self.iChat.conversation.iteritems(): #print date if date in days_to_avoid: continue textBlocksPerDay[date] = [] blockVectorsPerDay[date] = [] listMsgs = self.iChat.conversation[date] #print "Num msg",len(listMsgs) lastVector = None acumTokens = 0 acumText = [] textBlocks = [] blockVector = None blockVectors = [] for idx, dictMsg in enumerate(listMsgs): text = dictMsg["text"] if not filterPos: cleanTokens = utils.clean_text(text) else: cleanTokens = utils.clean_text(text, True, filterPos) acumTokens += len(cleanTokens) acumText.extend(cleanTokens) if acumTokens >= MIN_TOKENS: if not blockVector: vector = self.iSQL.getMsgVector(" ".join(acumText)) blockVector = vector else: vector = self.iSQL.getMsgVector(text) if not vector: continue distance = self.iSQL.distance(vector, blockVector) #print distance if distance > THRESHOLD: textBlocks.append(acumText) blockVectors.append(blockVector) blockVector = [] #print acumText acumText = [] acumTokens = 0 else: #print "aggregating" blockVector = self.iSQL.aggregateVectors( blockVector, vector) blockVectorsPerDay[date] = blockVectors textBlocksPerDay[date] = textBlocks #print len(textBlocks) return blockVectorsPerDay, textBlocksPerDay
import os
from argparse import ArgumentParser

from utils import evaluate, clean_text


if __name__ == "__main__":
    """
    python evaluate.py \
        --predictions_dir=./predictions/valid_predictions \
        --answers_dir=./data/valid_texts
    """
    parser = ArgumentParser()
    parser.add_argument("--predictions_dir")
    parser.add_argument("--answers_dir")
    parser.add_argument("--items_to_display", type=int, default=10)
    args = parser.parse_args()

    true_texts = []
    pred_texts = []
    for file in os.listdir(args.predictions_dir):
        true_texts.append(
            clean_text(open(os.path.join(args.answers_dir, file)).readline()))
        pred_texts.append(
            open(os.path.join(args.predictions_dir, file)).readline())

    evaluate(true_texts=true_texts,
             pred_texts=pred_texts,
             top_k=args.items_to_display)
def eval(context, question): with open(os.path.join(config.data_dir, "train", "word2idx.pkl"), "rb") as wi, \ open(os.path.join(config.data_dir, "train", "char2idx.pkl"), "rb") as ci, \ open(os.path.join(config.data_dir, "train", "word_embeddings.pkl"), "rb") as wb, \ open(os.path.join(config.data_dir, "train", "char_embeddings.pkl"), "rb") as cb: word2idx = pickle.load(wi) char2idx = pickle.load(ci) word_embedding_matrix = pickle.load(wb) char_embedding_matrix = pickle.load(cb) # transform them into Tensors word_embedding_matrix = torch.from_numpy( np.array(word_embedding_matrix)).type(torch.float32) char_embedding_matrix = torch.from_numpy( np.array(char_embedding_matrix)).type(torch.float32) idx2word = dict([(y, x) for x, y in word2idx.items()]) context = clean_text(context) context = [w for w in word_tokenize(context) if w] question = clean_text(question) question = [w for w in word_tokenize(question) if w] if len(context) > config.max_len_context: print("The context is too long. Maximum accepted length is", config.max_len_context, "words.") if max([len(w) for w in context]) > config.max_len_word: print("Some words in the context are longer than", config.max_len_word, "characters.") if len(question) > config.max_len_question: print("The question is too long. Maximum accepted length is", config.max_len_question, "words.") if max([len(w) for w in question]) > config.max_len_word: print("Some words in the question are longer than", config.max_len_word, "characters.") if len(question) < 3: print( "The question is too short. It needs to be at least a three words question." ) context_idx = np.zeros([config.max_len_context], dtype=np.int32) question_idx = np.zeros([config.max_len_question], dtype=np.int32) context_char_idx = np.zeros([config.max_len_context, config.max_len_word], dtype=np.int32) question_char_idx = np.zeros( [config.max_len_question, config.max_len_word], dtype=np.int32) # replace 0 values with word and char IDs for j, word in enumerate(context): if word in word2idx: context_idx[j] = word2idx[word] else: context_idx[j] = 1 for k, char in enumerate(word): if char in char2idx: context_char_idx[j, k] = char2idx[char] else: context_char_idx[j, k] = 1 for j, word in enumerate(question): if word in word2idx: question_idx[j] = word2idx[word] else: question_idx[j] = 1 for k, char in enumerate(word): if char in char2idx: question_char_idx[j, k] = char2idx[char] else: question_char_idx[j, k] = 1 model = BiDAF(word_vectors=word_embedding_matrix, char_vectors=char_embedding_matrix, hidden_size=config.hidden_size, drop_prob=config.drop_prob) try: if config.cuda: model.load_state_dict( torch.load(os.path.join(config.squad_models, "model_final.pkl"))["state_dict"]) else: model.load_state_dict( torch.load( os.path.join(config.squad_models, "model_final.pkl"), map_location=lambda storage, loc: storage)["state_dict"]) print("Model weights successfully loaded.") except: pass print( "Model weights not found, initialized model with random weights.") model.to(device) model.eval() with torch.no_grad(): context_idx, context_char_idx, question_idx, question_char_idx = torch.tensor(context_idx, dtype=torch.int64).unsqueeze(0).to(device),\ torch.tensor(context_char_idx, dtype=torch.int64).unsqueeze(0).to(device),\ torch.tensor(question_idx, dtype=torch.int64).unsqueeze(0).to(device),\ torch.tensor(question_char_idx, dtype=torch.int64).unsqueeze(0).to(device) pred1, pred2 = model(context_idx, context_char_idx, question_idx, question_char_idx) starts, ends = discretize(pred1.exp(), pred2.exp(), 15, 
False) prediction = " ".join(context[starts.item():ends.item() + 1]) return prediction
def parse_departures(self, html): bs = BeautifulSoup(html) dep = [] # Check for error messages msg = bs.findAll('span', {'class': 'rot fett'}) if msg and len(msg) > 0 and unicode(msg[0].text).find(u'technischen St') > 0: print '\n'.join(map(lambda x: x.text.replace(' ', ''), msg)) return [] errtable = bs.find('table', {'class':'errortable'}) if errtable and clean_text(errtable.text): print "Errortable found" print errtable.text return [] if bs.table and bs.table.tr: st_td = bs.table.tr.findAll('td') if st_td: station = clean_text(st_td[-1].text) else: print "Unexpected Error: Stationname not found" print "Debug:", st_td.encode('UTF-8') else: print "Unexpected Error: table or tr not found" print bs return [] # zusatztext crap zt = bs.find('td', {'class':'zusatztext'}) if zt: ma = ZUSATZTEXT_REGEX.search(zt.text) if ma: line = ma.group(1) direction = ma.group(2) if direction == direction.upper(): direction = direction.capitalize() tim = int(ma.group(3)) d = Departure(line=line, direction=direction, lowfloor=True, station=station, time=tim) dep.append(d) else: print zt.text table = bs.find('table', {'class':'imagetable'}) if not table: print "table not found" return [] if errtable: print "Warning: Empty errortable found" return dep trs = table.findAll('tr') for tr in trs[1:]: tds = tr.findAll('td') line = clean_text(tds[0].text) direction = clean_text(tds[1].text) if direction.startswith(line): direction = direction.lstrip(line).strip() if direction == direction.upper(): direction = direction.capitalize() lf_img = tds[-1].img lowfloor = lf_img and lf_img.has_key('alt') d = {'line': line, 'direction': direction, 'lowfloor': lowfloor, 'station': station} # parse time tim = clean_text(tds[2].text) dts = DELTATIME_REGEX.search(tim) abs = ABSTIME_REGEX.search(tim) if tim.find(u'...in K\xfcrze') >= 0: d['time'] = 0 elif abs: d['time'] = calc_datetime(abs.group(1)) elif tim.isdigit(): d['time'] = int(tim) elif dts: # is timedelta d['time'] = int(dts.group(1)) else: print "Error parsing time:", tim continue dep.append(Departure(**d)) return dep
def parse_house_bill(self, url, session): # using the print page makes the page simpler, and also *drastically* smaller (8k rather than 100k) url = re.sub("billsummary", "billsummaryprn", url) url = "%s/%s" % (self.senate_base_url, url) with self.urlopen(url) as bill_page: bill_page = lxml.html.fromstring(bill_page) bill_id = bill_page.xpath('//*[@class="entry-title"]') if len(bill_id) == 0: self.log("WARNING: bill summary page is blank! (%s)" % url) self.bad_urls.append(url) return bill_id = bill_id[0].text_content() bill_id = clean_text(bill_id) bill_desc = bill_page.xpath('//*[@class="BillDescription"]')[0].text_content() bill_desc = clean_text(bill_desc) table_rows = bill_page.xpath("//table/tr") # if there is a cosponsor all the rows are pushed down one for the extra row for the cosponsor: cosponsorOffset = 0 if table_rows[2][0].text_content().strip() == "Co-Sponsor:": cosponsorOffset = 1 lr_label_tag = table_rows[3 + cosponsorOffset] assert lr_label_tag[0].text_content().strip() == "LR Number:" bill_lr = lr_label_tag[1].text_content() lastActionOffset = 0 if table_rows[4 + cosponsorOffset][0].text_content().strip() == "Governor Action:": lastActionOffset = 1 official_title_tag = table_rows[5 + cosponsorOffset + lastActionOffset] assert official_title_tag[0].text_content().strip() == "Bill String:" official_title = official_title_tag[1].text_content() # could substitute the description for the name, # but keeping it separate for now. bill_type = "bill" triplet = bill_id[:3] if triplet in bill_types: bill_type = bill_types[triplet] subs = [] bid = bill_id.replace(" ", "") if bid in self.subjects: subs = self.subjects[bid] self.log("With subjects for this bill") self.log(bid) bill = Bill( session, "lower", bill_id, bill_desc, bill_url=url, bill_lr=bill_lr, official_title=official_title, type=bill_type, subjects=subs, ) bill.add_source(url) bill_sponsor = clean_text(table_rows[0][1].text_content()) try: bill_sponsor_link = table_rows[0][1][0].attrib["href"] except IndexError: return if bill_sponsor_link: bill_sponsor_link = "%s%s" % (self.senate_base_url, bill_sponsor_link) bill.add_sponsor("primary", bill_sponsor, sponsor_link=bill_sponsor_link) # check for cosponsors if cosponsorOffset == 1: if len(table_rows[2][1]) == 1: # just a name cosponsor = table_rows[2][1][0] bill.add_sponsor( "cosponsor", cosponsor.text_content(), sponsor_link="%s/%s" % (self.senate_base_url, cosponsor.attrib["href"]), ) else: # name ... 
etal try: cosponsor = table_rows[2][1][0] bill.add_sponsor( "cosponsor", clean_text(cosponsor.text_content()), sponsor_link="%s/%s" % (self.senate_base_url, cosponsor.attrib["href"]), ) self.parse_cosponsors_from_bill( bill, "%s/%s" % (self.senate_base_url, table_rows[2][1][1].attrib["href"]) ) except scrapelib.HTTPError as e: self.log("WARNING: " + str(e)) self.bad_urls.append(url) self.log("WARNING: no bill summary page (%s)" % url) actions_link_tag = bill_page.xpath('//div[@class="Sections"]/a')[0] actions_link = "%s/%s" % (self.senate_base_url, actions_link_tag.attrib["href"]) actions_link = re.sub("content", "print", actions_link) self.parse_house_actions(bill, actions_link) # get bill versions doc_tags = bill_page.xpath('//div[@class="BillDocsSection"][1]/span') for doc_tag in reversed(doc_tags): doc = clean_text(doc_tag.text_content()) text_url = "%s%s" % (self.senate_base_url, doc_tag[0].attrib["href"]) bill.add_document(doc, text_url, mimetype="text/html") # get bill versions version_tags = bill_page.xpath('//div[@class="BillDocsSection"][2]/span') for version_tag in reversed(version_tags): version = clean_text(version_tag.text_content()) text_url = "%s%s" % (self.senate_base_url, version_tag[0].attrib["href"]) pdf_url = "%s%s" % (self.senate_base_url, version_tag[1].attrib["href"]) bill.add_version(version, text_url, pdf_url=pdf_url, on_duplicate="use_new") self.save_bill(bill)
def retrieve(dataset, index, filename=None): if index >= len(dataset): st.error(f"Index {index} exceeds dataset length.") eval_dataset = None if filename: # TODO Handle this through dedicated fields if "cnn_dailymail" in filename: eval_dataset = "cnndm" elif "xsum" in filename: eval_dataset = "xsum" data = dataset[index] id_ = data.get('id', '') try: document = data['spacy:document'] except KeyError: if not is_lg: st.error( "'en_core_web_lg model' is required unless loading from cached file." "To install: 'python -m spacy download en_core_web_lg'") try: text = data['document'] except KeyError: text = data['article'] if not text: st.error("Document is blank") return document = nlp(text if args.no_clean else clean_text(text)) document._.name = "Document" document._.column = "document" try: reference = data['spacy:summary:reference'] except KeyError: if not is_lg: st.error( "'en_core_web_lg model' is required unless loading from cached file." "To install: 'python -m spacy download en_core_web_lg'") try: text = data['summary'] if 'summary' in data else data[ 'summary:reference'] except KeyError: text = data.get('highlights') if text: reference = nlp(text if args.no_clean else clean_text(text)) else: reference = None if reference is not None: reference._.name = "Reference" reference._.column = "summary:reference" model_names = set() for k in data: m = re.match('(preprocessed_)?summary:(?P<model>.*)', k) if m: model_name = m.group('model') if model_name != 'reference': model_names.add(model_name) preds = [] for model_name in model_names: try: pred = data[f"spacy:summary:{model_name}"] except KeyError: if not is_lg: st.error( "'en_core_web_lg model' is required unless loading from cached file." "To install: 'python -m spacy download en_core_web_lg'") text = data[f"summary:{model_name}"] pred = nlp(text if args.no_clean else clean_text(text)) parts = model_name.split("-") primary_sort = 0 if len(parts) == 2: model, train_dataset = parts if train_dataset == eval_dataset: formatted_model_name = model.upper() else: formatted_model_name = f"{model.upper()} ({train_dataset.upper()}-trained)" if train_dataset in ["xsum", "cnndm"]: primary_sort = 1 else: primary_sort = 2 else: formatted_model_name = model_name.upper() pred._.name = formatted_model_name pred._.column = f"summary:{model_name}" preds.append(((primary_sort, formatted_model_name), pred)) preds = [pred for _, pred in sorted(preds)] return Instance( id_=id_, document=document, reference=reference, preds=preds, data=data, )
def get_skills(custom_entities, text): exp = get_head_sections(text)['skills'] try: model_dir = config.skill_model_dir nlp2 = spacy.load(model_dir) doc2 = nlp2(exp) entities = utils.extract_entities(doc2) for key, val in entities.items(): entities[key] = utils.clean_text(val) if len(entities['Skills']) > 4: return entities['Skills'] except: print("No skills entitiy") skills = list() exp = [] lines = [lin.strip() for lin in text.split('\n')] for ind, line in enumerate(lines): if len(line.split()) < 4 and ('skills' in line.lower()): try: for i in range(1, 5): exp.append(lines[ind + i]) except: exp.append(lines[ind + i]) description = [ e for e in exp if not e[:5].lower() == 'level' and len(e) > 0 ] exp = get_head_sections(text)['skills'] if len(exp.split('\n')) < 4: exp = '' for ind, line in enumerate(lines): if len(line.split()) < 8 and ('skills' in line.lower() or 'expertise' in line.lower() or 'strength' in line.lower() or 'proficiency' in line.lower()): try: try: try: for i in range(1, 25): exp += lines[ind + i] + ' ' except: for i in range(1, 15): exp += lines[ind + i] + ' ' except: for i in range(1, 10): exp += lines[ind + i] + ' ' except: exp += lines[ind + 1] + ' ' + lines[ind + 2] exp = exp.lower() skillset = list() skill_dict = {} nlp_text = nlp(exp) noun_chunks = list(nlp_text.noun_chunks) tokens = [token.text for token in nlp_text if not token.is_stop] data = pd.read_csv(config.skill_csv) bigrams = utils.extract_ngrams(exp, 2) skills = list(data.columns.values) custom_skillset = get_custom_skills(text) try: # check for one-grams for token in tokens: if token.lower() in skills: skillset.append(token) # check for bi-grams and tri-grams for token in noun_chunks: token = token.text.lower().strip() if token in skills and token not in skillset: skillset.append(token) for token in bigrams: if token.lower() in skills and token not in skillset: skillset.append(token) skillset = [i.lower() for i in set([i.lower() for i in skillset])] if len(skillset) < 5: for skill in custom_skillset: if skill not in skillset: skillset.append(skill) try: skillset = skillset[:5] except: pass skillset = [i.capitalize() for i in set([i.lower() for i in skillset])] skillset = [i.strip() for i in skillset if not i in (' a ')] skillset = sorted(skillset, key=len, reverse=True) if len(skillset) < 3: skillset = get_mod_skills(text) return skillset except: if len(skillset) < 3: skillset = get_mod_skills(text) return skillset
def parse_house_bill(self, url, session): url = re.sub("content", "print", url) with self.urlopen(url) as bill_page_data: bill_page = BeautifulSoup(bill_page_data) header_table = bill_page.table # get all the info needed to record the bill bill_id = header_table.b.contents[0] bill_id = clean_text(bill_id) bill_desc = header_table.findAll('td')[1].contents[0] bill_desc = clean_text(bill_desc) lr_label_tag = bill_page.find(text=re.compile("LR Number:")) bill_lr = lr_label_tag.next.contents[0].strip() # could substitute the description for the name, # but keeping it separate for now. bill = Bill(session, 'lower', bill_id, bill_desc, bill_url=url, bill_lr=bill_lr) bill.add_source(url) # get the sponsors and cosponsors sponsor_dirty = bill_page.em.contents[0] m = re.search("(.*)\(.*\)", sponsor_dirty) if m: bill_sponsor = m.group(1) else: bill_sponsor = sponsor_dirty # find the table with bill details...it'll be useful later bill_details_tbl = bill_page.table.nextSibling.nextSibling bill_sponsor_link = None if bill_details_tbl.a: bill_sponsor_link = bill_details_tbl.a['href'] bill.add_sponsor('primary', bill_sponsor, sponsor_link=bill_sponsor_link) # check for cosponsors cosponsor_cell = bill_details_tbl.find( text=re.compile("CoSponsor")).next if cosponsor_cell.a: self.parse_house_cosponsors(bill, cosponsor_cell) # parse out all the actions actions_link_tag = bill_page.find('a', text='ACTIONS').previous.previous actions_link = actions_link_tag['href'] actions_link = re.sub("content", "print", actions_link) self.parse_house_actions(bill, actions_link) # get bill versions version_tags = bill_page.findAll(href=re.compile("biltxt")) if version_tags: for version_tag in version_tags: if version_tag.b: version = clean_text(version_tag.b.contents[0]) text_url = version_tag['href'] pdf_url = version_tag.previousSibling pdf_url = pdf_url.previousSibling['href'] bill.add_version(version, text_url, pdf_url=pdf_url) self.save_bill(bill)
    mlp.train(df[fieldname].values, df[labelname].values)
    return mlp


if __name__ == '__main__':
    pos_neg_ratios = Counter()
    reviews = pd.read_csv("reviews.csv", encoding="utf-8")
    ratings = pd.read_csv("ratings.csv", encoding="utf-8")
    review_ratings = pd.merge(reviews, ratings)
    review_ratings["binary_ratings"] = review_ratings["ratings"].apply(
        lambda x: POSITIVE if x > 3 else NEGATIVE)
    review_ratings = review_ratings[
        review_ratings["unixReviewTime"] > 1403913600]
    review_ratings = clean_text(review_ratings, "summary")
    print(review_ratings.shape)
    mlp = run_network(review_ratings, "summary", "binary_ratings")
    prediction = mlp.predict("good")
    print(prediction)
    prediction = mlp.predict("Bad")
    print(prediction)

# filter_punctuations = lambda text: "".join(list(filter(lambda x: x not in string.punctuation, str(text))))
#
#
# def clean_text_field(df, fieldname):
#     df[fieldname] = df[fieldname].str.lower()
#     df[fieldname] = df[fieldname].apply(filter_punctuations)
#     df.dropna(subset=[fieldname], inplace=True)
def _parse_house_bill(self, url, session): # using the print page makes the page simpler, and also *drastically* smaller (8k rather than 100k) url = re.sub("billsummary", "billsummaryprn", url) url = '%s/%s' % (self._senate_base_url, url) bill_page = self.get(url).text bill_page = lxml.html.fromstring(bill_page) bill_page.make_links_absolute(url) bill_id = bill_page.xpath('//*[@class="entry-title"]') if len(bill_id) == 0: self.log("WARNING: bill summary page is blank! (%s)" % url) self._bad_urls.append(url) return bill_id = bill_id[0].text_content() bill_id = clean_text(bill_id) bill_desc = bill_page.xpath( '//*[@class="BillDescription"]')[0].text_content() bill_desc = clean_text(bill_desc) table_rows = bill_page.xpath('//table/tr') # if there is a cosponsor all the rows are pushed down one for the extra row for the cosponsor: cosponsorOffset = 0 if table_rows[2][0].text_content().strip() == 'Co-Sponsor:': cosponsorOffset = 1 lr_label_tag = table_rows[3 + cosponsorOffset] assert lr_label_tag[0].text_content().strip() == 'LR Number:' bill_lr = lr_label_tag[1].text_content() lastActionOffset = 0 if table_rows[4 + cosponsorOffset][0].text_content().strip( ) == 'Governor Action:': lastActionOffset = 1 official_title_tag = table_rows[5 + cosponsorOffset + lastActionOffset] assert official_title_tag[0].text_content().strip() == 'Bill String:' official_title = official_title_tag[1].text_content() # could substitute the description for the name, # but keeping it separate for now. bill_type = "bill" triplet = bill_id[:3] if triplet in bill_types: bill_type = bill_types[triplet] subs = [] bid = bill_id.replace(" ", "") if bid in self._subjects: subs = self._subjects[bid] self.log("With subjects for this bill") self.log(bid) if bill_desc == "": print("ERROR: Blank title. Skipping. {} / {} / {}".format( bill_id, bill_desc, official_title)) # XXX: Some pages full of blank bills. return bill = Bill(session, 'lower', bill_id, bill_desc, bill_url=url, bill_lr=bill_lr, official_title=official_title, type=bill_type, subjects=subs) bill.add_source(url) bill_sponsor = clean_text(table_rows[0][1].text_content()) try: bill_sponsor_link = table_rows[0][1][0].attrib['href'] except IndexError: return if bill_sponsor_link: bill_sponsor_link = '%s%s' % (self._senate_base_url, bill_sponsor_link) bill.add_sponsor('primary', bill_sponsor, sponsor_link=bill_sponsor_link) # check for cosponsors if cosponsorOffset == 1: if len(table_rows[2][1]) == 1: # just a name cosponsor = table_rows[2][1][0] bill.add_sponsor( 'cosponsor', cosponsor.text_content(), sponsor_link='%s/%s' % (self._senate_base_url, cosponsor.attrib['href'])) else: # name ... 
etal try: cosponsor = table_rows[2][1][0] bill.add_sponsor( 'cosponsor', clean_text(cosponsor.text_content()), sponsor_link='%s/%s' % (self._senate_base_url, cosponsor.attrib['href'])) sponsors_url, = bill_page.xpath( "//a[contains(@href, 'CoSponsors.aspx')]/@href") self._parse_cosponsors_from_bill(bill, sponsors_url) except scrapelib.HTTPError as e: self.log("WARNING: " + str(e)) self._bad_urls.append(url) self.log("WARNING: no bill summary page (%s)" % url) # actions_link_tag = bill_page.xpath('//div[@class="Sections"]/a')[0] # actions_link = '%s/%s' % (self._senate_base_url,actions_link_tag.attrib['href']) # actions_link = re.sub("content", "print", actions_link) actions_link, = bill_page.xpath( "//a[contains(@href, 'BillActions.aspx')]/@href") self._parse_house_actions(bill, actions_link) # get bill versions doc_tags = bill_page.xpath('//div[@class="BillDocsSection"][1]/span') for doc_tag in reversed(doc_tags): doc = clean_text(doc_tag.text_content()) text_url = '%s%s' % (self._senate_base_url, doc_tag[0].attrib['href']) bill.add_document(doc, text_url, mimetype="text/html") # get bill versions version_tags = bill_page.xpath( '//div[@class="BillDocsSection"][2]/span') for version_tag in reversed(version_tags): version = clean_text(version_tag.text_content()) for vurl in version_tag.xpath(".//a"): if vurl.text == 'PDF': mimetype = 'application/pdf' else: mimetype = 'text/html' bill.add_version(version, vurl.attrib['href'], on_duplicate='use_new', mimetype=mimetype) self.save_bill(bill)
| Repository | Description |
|:-------------|:------------------|""" % (lang, lang, lang_git)
    print(header)
    print(header_trendy)
    fname = "data/%s-github-trendy.json" % lang
    with open(fname) as f:
        for line in f:
            package = json.loads(line)
            if "description" not in package or "name" not in package or "url" not in package or not package[
                    "name"] or not package["url"]:
                continue
            if package["description"] is None:
                package["description"] = ""
            row = "| [%s](%s) | %s |" % (clean_text(
                package["name"]), package["url"], clean_text(
                    package["description"]))
            print(row)

    print(header_packages)
    fname = "data/%s.json" % lang
    with open(fname) as f:
        for line in f:
            package = json.loads(line)
            if "description" not in package or "name" not in package or "url" not in package or not package[
                    "name"] or not package["url"]:
                continue
            if package["description"] is None:
                package["description"] = ""
            row = "| [%s](%s) | %s |" % (clean_text(
                package["name"]), package["url"], clean_text(
                    package["description"]))
            print(row)
def re_clean(df):
    text = df['comment_clean'].tolist()
    cleaned_text = clean_text(text)
    df.loc[:, 'reviews'] = cleaned_text
    return df
def cleaning(tweet):
    return sentence_spell_check(cont.fix(clean_text(tweet)))
def parse_house_bill(self, url, session): # using the print page makes the page simpler, and also *drastically* smaller (8k rather than 100k) url = re.sub("billsummary", "billsummaryprn", url) url = '%s/%s' % (self.senate_base_url,url) with self.urlopen(url) as bill_page: bill_page = lxml.html.fromstring(bill_page) bill_id = bill_page.xpath('//*[@class="entry-title"]') if len(bill_id) == 0: print "WARNING: bill summary page is blank! (%s)" % url self.bad_urls.append(url) return bill_id = bill_id[0].text_content() bill_id = clean_text(bill_id) bill_desc = bill_page.xpath('//*[@class="BillDescription"]')[0].text_content() bill_desc = clean_text(bill_desc) table_rows = bill_page.xpath('//table/tr') # if there is a cosponsor all the rows are pushed down one for the extra row for the cosponsor: cosponsorOffset = 0 if table_rows[2][0].text_content().strip() == 'Co-Sponsor:': cosponsorOffset = 1 lr_label_tag = table_rows[3+cosponsorOffset] assert lr_label_tag[0].text_content().strip() == 'LR Number:' bill_lr = lr_label_tag[1].text_content() lastActionOffset = 0 if table_rows[4+cosponsorOffset][0].text_content().strip() == 'Governor Action:': lastActionOffset = 1 official_title_tag = table_rows[5+cosponsorOffset+lastActionOffset] assert official_title_tag[0].text_content().strip() == 'Bill String:' official_title = official_title_tag[1].text_content() # could substitute the description for the name, # but keeping it separate for now. bill = Bill(session, 'lower', bill_id, bill_desc, bill_url=url, bill_lr=bill_lr, official_title=official_title) bill.add_source(url) bill_sponsor = clean_text(table_rows[0][1].text_content()) bill_sponsor_link = table_rows[0][1][0].attrib['href'] if bill_sponsor_link: bill_sponsor_link = '%s%s' % (self.senate_base_url,bill_sponsor_link) bill.add_sponsor('primary', bill_sponsor, sponsor_link=bill_sponsor_link) # check for cosponsors if cosponsorOffset == 1: if len(table_rows[2][1]) == 1: # just a name cosponsor = table_rows[2][1][0] bill.add_sponsor('cosponsor', cosponsor.text_content(), sponsor_link='%s/%s' % (self.senate_base_url,cosponsor.attrib['href'])) else: # name ... etal try: cosponsor = table_rows[2][1][0] bill.add_sponsor('cosponsor', clean_text(cosponsor.text_content()), sponsor_link='%s/%s' % (self.senate_base_url,cosponsor.attrib['href'])) self.parse_cosponsors_from_bill(bill,'%s/%s' % (self.senate_base_url,table_rows[2][1][1].attrib['href'])) except scrapelib.HTTPError: self.bad_urls.append(url) print "WARNING: no bill summary page (%s)" % url actions_link_tag = bill_page.xpath('//div[@class="Sections"]/a')[0] actions_link = '%s/%s' % (self.senate_base_url,actions_link_tag.attrib['href']) actions_link = re.sub("content", "print", actions_link) self.parse_house_actions(bill, actions_link) # get bill versions version_tags = bill_page.xpath('//div[@class="BillDocsSection"][2]/span') for version_tag in reversed(version_tags): version = clean_text(version_tag.text_content()) text_url = '%s%s' % (self.senate_base_url,version_tag[0].attrib['href']) pdf_url = '%s%s' % (self.senate_base_url,version_tag[1].attrib['href']) bill.add_version(version, text_url, pdf_url=pdf_url) self.save_bill(bill)
def __clean_text(self, text):
    return utils.clean_text(text)
def main(argv): IMAGE_DIRECTORY = '/images' CSV_FILE_PATH = 'data.csv' num_epochs = 250000 BATCH_SIZE = 1 img_shape = (224, 224, 3) # Reduce based on RAM GRAPH_THRESHOLD = 0.5 LEARNING_RATE = 1.6192e-05 LOSS_WEIGHTS = [0.6, 0.2, 0.2] #Give importance to classification, semantic and gap loss respectively. IMAGE_ENCODER = 'resnet50' TEXT_ENCODER = 'bert' try: opts, args = getopt.getopt(argv,"i:t:b:",["image_encoder=","text_encoder=","batch_size="]) except getopt.GetoptError: print('test -i <image_folder> -c <csv_filename>') sys.exit(2) for opt, arg in opts: if opt == '-h': print('test.py -i <inputfile> -o <outputfile>') sys.exit() elif opt in ("-i", "--image_encoder"): IMAGE_ENCODER = arg elif opt in ("-c", "--text_encoder"): TEXT_ENCODER = arg elif opt in ("-b", "--batch_size"): BATCH_SIZE = int(arg) print("Set batch_size to %d" %BATCH_SIZE) df = pd.read_csv(CSV_FILE_PATH) num_samples = df.shape[0] class_names = df.classes.unique() ## CONVERT TO CATEGORICAL temp = list(df.classes) training_class_intmap = temp.copy() ### map each color to an integer mapping = {} for x in range(len(class_names)): mapping[class_names[x]] = x # integer representation for x in range(df.shape[0]): training_class_intmap[x] = mapping[training_class_intmap[x]] training_classes = tf.keras.utils.to_categorical(training_class_intmap) image_names = df.image text_list = df.text text_list = utils.clean_text(text_list) num_classes = len(class_names) adj_graph_classes = utils.get_adj_graph(class_names) if (IMAGE_ENCODER=='resnet50'): image_embedding_extractor_model = encoder.get_resnet50(img_shape) image_encoder_size = 2048 elif (IMAGE_ENCODER=='resnet101'): image_embedding_extractor_model = encoder.get_resnet101(img_shape) if (TEXT_ENCODER=='bert'): bert_embedding = BertEmbedding() text_encoder_size = 768 complete_model = build_model(image_encoder_size , text_encoder_size, num_classes) train_loss_results = [] # train_accuracy_results = [] optimizer = tf.keras.optimizers.RMSprop(learning_rate=LEARNING_RATE) #Define the optimize and specify the learning rate for epoch in range(num_epochs): epoch_loss_avg = tf.keras.metrics.Mean() # epoch_accuracy = tf.keras.metrics.CategoricalAccuracy() #Uncomment if you want to track # Training loop - using batches of 1024 # encode_and_pack_batch(batch_size, image_encoder, text_encoder, image_names, text_list, training_classes, img_shape): xi1 , xt1,xi2 , xt2, y1, y2 = utils.encode_and_pack_batch(BATCH_SIZE, image_embedding_extractor_model, bert_embedding ,image_names, text_list, training_classes, img_shape) x1 = [xi1, xt1] x2 = [xi2, xt2] # Optimize the model loss_value, grads = grad(complete_model, x1,x2,y1, y2, LOSS_WEIGHTS,GRAPH_THRESHOLD, adj_graph_classes) optimizer.apply_gradients(zip(grads, complete_model.trainable_variables)) # Track progress epoch_loss_avg.update_state(loss_value) # Add current batch loss # End epoch train_loss_results.append(epoch_loss_avg.result()) if epoch % 5 == 0: print("Epoch {:03d}: Loss: {:.3f}".format(epoch,epoch_loss_avg.result()))
def predict(config):
    input = clean_text(config.input)
    params = Params('config/params.json')

    # load tokenizer and torchtext Fields
    pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb')
    cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    pickle_kor = open('pickles/kor.pickle', 'rb')
    kor = pickle.load(pickle_kor)
    pickle_eng = open('pickles/eng.pickle', 'rb')
    eng = pickle.load(pickle_eng)
    eos_idx = eng.vocab.stoi['<eos>']

    # load trained model
    model = Transformer(params)
    model.load_state_dict(torch.load(params.save_model))
    model.to(params.device)
    model.eval()

    # convert input into a tensor and forward it through the model
    tokenized = tokenizer.tokenize(input)
    indexed = [kor.vocab.stoi[token] for token in tokenized]

    source = torch.LongTensor(indexed).unsqueeze(0).to(params.device)  # [1, source_len]: unsqueeze to add batch size
    target = torch.zeros(1, params.max_len).type_as(source.data)       # [1, max_len]

    encoder_output = model.encoder(source)
    next_symbol = eng.vocab.stoi['<sos>']

    # greedy decoding: keep the argmax token at position i at every step
    for i in range(0, params.max_len):
        if next_symbol == eos_idx:
            break
        target[0][i] = next_symbol
        decoder_output, _ = model.decoder(target, source, encoder_output)  # [1, target_len, output_dim]
        prob = decoder_output.squeeze(0).max(dim=-1, keepdim=False)[1]
        next_word = prob.data[i]
        next_symbol = next_word.item()

    # cut the generated sequence at <eos>; fall back to the full length
    # if the model never produced an <eos> token
    eos_positions = torch.where(target[0] == eos_idx)[0]
    eos_pos = eos_positions[0].item() if len(eos_positions) > 0 else params.max_len
    target = target[0][:eos_pos].unsqueeze(0)

    # translation_tensor = [target length] filled with word indices
    target, attention_map = model(source, target)
    target = target.squeeze(0).max(dim=-1)[1]

    translated_token = [eng.vocab.itos[token] for token in target]
    # translation = translated_token[:translated_token.index('<eos>')]
    # translation = ''.join(translation)
    translation = ''.join(translated_token)

    print(f'question> {config.input}')
    print(f'reply> {translation}')

    display_attention(tokenized, translated_token, attention_map[4].squeeze(0)[:-1])
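The decode loop above is ordinary greedy decoding. The same idea isolated from the torchtext plumbing, assuming only a decode_step callable and integer start/end symbols (names are illustrative, not from the original code):

import torch

def greedy_decode(decode_step, sos_idx, eos_idx, max_len, device="cpu"):
    """Illustrative greedy decoding: at step i, feed everything generated so
    far and keep the argmax token at position i. decode_step(target) must
    return logits of shape [1, max_len, vocab]."""
    target = torch.zeros(1, max_len, dtype=torch.long, device=device)
    next_symbol = sos_idx
    for i in range(max_len):
        target[0, i] = next_symbol
        logits = decode_step(target)
        next_symbol = logits[0, i].argmax().item()
        if next_symbol == eos_idx:
            break
    return target[0, :i + 1]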
        # (fragment: tail of the dataset's __getitem__, returning one encoded example)
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask_ids': torch.tensor(mask_ids, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'target_start_idx': torch.tensor(target_start_idx, dtype=torch.long),
            'target_end_idx': torch.tensor(target_end_idx, dtype=torch.long),
            'offsets': torch.tensor(offsets, dtype=torch.long),
            'orig_sentiment': sentiment,
            'orig_sele_text': selected_text,
            'orig_text': text,
            'targ_sentiment': torch.tensor(targ_sentiment, dtype=torch.long)
        }


if __name__ == "__main__":
    trn_df = pd.read_csv(config.TRAIN_FILE)
    trn_df['text'] = trn_df['text'].apply(lambda x: clean_text(x))
    trn_df['selected_text'] = trn_df['selected_text'].apply(lambda x: clean_text(x))

    dataset = TweetDataset(trn_df['text'].values,
                           trn_df['selected_text'].values,
                           trn_df['sentiment'].values,
                           config.TOKENIZER,
                           config.MAX_LEN,
                           config.MODEL_VERSION)

    # sanity-check that every row can be encoded; print the index of any row that fails
    for i in range(len(trn_df)):
        try:
            dataset[i]
        except Exception:
            print(i)
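The target_start_idx / target_end_idx fields above are token positions of the selected span. A minimal sketch of how such targets are commonly derived from character offsets, assuming offsets is a list of (start, end) pairs per token; the helper name is hypothetical and not taken from the project:

def span_to_token_targets(text, selected_text, offsets):
    """Map a character span (first occurrence of selected_text in text) to
    the first/last token indices whose offsets overlap that span."""
    char_start = text.find(selected_text)
    char_end = char_start + len(selected_text)
    token_idxs = [i for i, (s, e) in enumerate(offsets)
                  if s < char_end and e > char_start and s != e]
    return token_idxs[0], token_idxs[-1]

# span_to_token_targets("i love it", "love", [(0, 1), (2, 6), (7, 9)]) -> (1, 1)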
def get_data_features(max_seq_len, embedding_file, batch_size): """ Args: max_seq_len: Max sequence length of the sentences embedding_file: Embedding file batch_size: Batch size for the DataLoader Output: embedding_dim, word_index, embedding_matrix, X_train, y_train, X_test, y_test """ #Load data train, val, test, features_train, features_val, features_test = load_data_features( ) #Embedding dimension based on the embedding_file embedding_dim = int(re.findall('\d{3,}', embedding_file)[0]) #Clean data X_train = [ clean_text(text, remove_punt_number_special_chars=True, remove_stopwords=True, apply_stemming=False) for text in train["text"] ] X_val = [ clean_text(text, remove_punt_number_special_chars=True, remove_stopwords=True, apply_stemming=False) for text in val["text"] ] X_test = [ clean_text(text, remove_punt_number_special_chars=True, remove_stopwords=True, apply_stemming=False) for text in test["text"] ] y_train = encode_label(train["label"]) y_val = encode_label(val["label"]) y_test = encode_label(test["label"]) tokenizer = Tokenizer(num_words=10000000) tokenizer.fit_on_texts(list(X_train) + list(X_val)) word_index = tokenizer.word_index vocab_size = len(word_index) + 1 #Embeddings embeddings_index = load_glove(embedding_file) embedding_matrix = create_weight_matrix(vocab_size, word_index, embedding_dim, embeddings_index) X_train = tokenizer.texts_to_sequences(X_train) X_train = pad_sequences(X_train, maxlen=max_seq_len) X_val = tokenizer.texts_to_sequences(X_val) X_val = pad_sequences(X_val, maxlen=max_seq_len) X_test = tokenizer.texts_to_sequences(X_test) X_test = pad_sequences(X_test, maxlen=max_seq_len) train_dataloader = get_dataloader_features(X_train, features_train, y_train, batch_size) val_dataloader = get_dataloader_features(X_val, features_val, y_val, batch_size) test_dataloader = get_dataloader_features(X_test, features_test, y_test, batch_size) return embedding_dim, int( vocab_size ), embedding_matrix, train_dataloader, val_dataloader, test_dataloader
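create_weight_matrix is referenced above but not shown. A plausible sketch under the assumption that embeddings_index maps words to NumPy vectors and out-of-vocabulary rows are left at zero (row 0 is reserved for padding):

import numpy as np

def create_weight_matrix(vocab_size, word_index, embedding_dim, embeddings_index):
    """Hypothetical reconstruction: one row per word index, filled from the
    pretrained vectors when available, zeros otherwise."""
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, idx in word_index.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[idx] = vector
    return embedding_matrix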
directory = '../scraper/'
data_files = []
json_file_item = None
with open('../artifacts/anon_dict.json') as json_file:
    json_file_item = json.load(json_file)

analysis = ""
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith('.csv') and filename.startswith('results_'):
        data_files.append(pd.read_csv(directory + filename))
        analysis = filename.split('.csv')[0]

df = pd.concat(data_files, sort=False)
df['clean_text'] = df['text'].map(lambda x: clean_text(x))
df['date'] = df['timestamp'].apply(lambda x: transform_date(x))
df['year'] = df['date'].apply(lambda x: x.year)
df = df.loc[df['year'] >= df['year'].max(), ]
df['hashtags'] = df['text'].map(lambda x: get_hashtags_operations(x))

terms_attacks = json_file_item["attacks"]
df['attack'] = df['clean_text'].map(lambda x: check_attack(x, terms_attacks))
df['operations'] = df['hashtags'].map(
    lambda x: any(hashtag.startswith('#op') for hashtag in x))
df['RT'] = df['clean_text'].map(lambda x: 'rt' in x)  # substring check on the cleaned text

# Translate RTs to Attacks
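get_hashtags_operations and check_attack are imported project helpers; minimal sketches of what they might do (the regex and term matching are assumptions, the real definitions live in the project):

import re

def get_hashtags_operations(text):
    """Hypothetical: pull lower-cased hashtags out of a tweet."""
    return [tag.lower() for tag in re.findall(r"#\w+", text)]

def check_attack(clean_tweet, terms_attacks):
    """Hypothetical: flag a tweet when any attack-related term appears."""
    return any(term in clean_tweet for term in terms_attacks)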
    def _parse_house_bill(self, url, session):
        # using the print page makes the page simpler, and also *drastically* smaller (8k rather than 100k)
        url = re.sub("billsummary", "billsummaryprn", url)
        url = '%s/%s' % (self._senate_base_url, url)

        bill_page = self.get(url).text
        bill_page = lxml.html.fromstring(bill_page)
        bill_page.make_links_absolute(url)

        bill_id = bill_page.xpath('//*[@class="entry-title"]')
        if len(bill_id) == 0:
            self.log("WARNING: bill summary page is blank! (%s)" % url)
            self._bad_urls.append(url)
            return
        bill_id = bill_id[0].text_content()
        bill_id = clean_text(bill_id)

        bill_desc = bill_page.xpath('//*[@class="BillDescription"]')[0].text_content()
        bill_desc = clean_text(bill_desc)

        table_rows = bill_page.xpath('//table/tr')
        # if there is a cosponsor all the rows are pushed down one for the extra row for the cosponsor:
        cosponsorOffset = 0
        if table_rows[2][0].text_content().strip() == 'Co-Sponsor:':
            cosponsorOffset = 1

        lr_label_tag = table_rows[3 + cosponsorOffset]
        assert lr_label_tag[0].text_content().strip() == 'LR Number:'
        bill_lr = lr_label_tag[1].text_content()

        lastActionOffset = 0
        if table_rows[4 + cosponsorOffset][0].text_content().strip() == 'Governor Action:':
            lastActionOffset = 1

        official_title_tag = table_rows[5 + cosponsorOffset + lastActionOffset]
        assert official_title_tag[0].text_content().strip() == 'Bill String:'
        official_title = official_title_tag[1].text_content()

        # could substitute the description for the name,
        # but keeping it separate for now.
        bill_type = "bill"
        triplet = bill_id[:3]
        if triplet in bill_types:
            bill_type = bill_types[triplet]

        subs = []
        bid = bill_id.replace(" ", "")
        if bid in self._subjects:
            subs = self._subjects[bid]
            self.log("With subjects for this bill")
            self.log(bid)

        if bill_desc == "":
            print("ERROR: Blank title. Skipping. {} / {} / {}".format(
                bill_id, bill_desc, official_title))
            # XXX: Some pages full of blank bills.
            return

        bill = Bill(session, 'lower', bill_id, bill_desc,
                    bill_url=url, bill_lr=bill_lr,
                    official_title=official_title,
                    type=bill_type, subjects=subs)
        bill.add_source(url)

        bill_sponsor = clean_text(table_rows[0][1].text_content())
        try:
            bill_sponsor_link = table_rows[0][1][0].attrib['href']
        except IndexError:
            return
        if bill_sponsor_link:
            bill_sponsor_link = '%s%s' % (self._senate_base_url, bill_sponsor_link)
        bill.add_sponsor('primary', bill_sponsor, sponsor_link=bill_sponsor_link)

        # check for cosponsors
        if cosponsorOffset == 1:
            if len(table_rows[2][1]) == 1:
                # just a name
                cosponsor = table_rows[2][1][0]
                bill.add_sponsor('cosponsor', cosponsor.text_content(),
                                 sponsor_link='%s/%s' % (self._senate_base_url,
                                                         cosponsor.attrib['href']))
            else:
                # name ... et al.
                try:
                    cosponsor = table_rows[2][1][0]
                    bill.add_sponsor('cosponsor', clean_text(cosponsor.text_content()),
                                     sponsor_link='%s/%s' % (self._senate_base_url,
                                                             cosponsor.attrib['href']))
                    sponsors_url, = bill_page.xpath(
                        "//a[contains(@href, 'CoSponsors.aspx')]/@href")
                    self._parse_cosponsors_from_bill(bill, sponsors_url)
                except scrapelib.HTTPError as e:
                    self.log("WARNING: " + str(e))
                    self._bad_urls.append(url)
                    self.log("WARNING: no bill summary page (%s)" % url)

        # actions_link_tag = bill_page.xpath('//div[@class="Sections"]/a')[0]
        # actions_link = '%s/%s' % (self._senate_base_url, actions_link_tag.attrib['href'])
        # actions_link = re.sub("content", "print", actions_link)
        actions_link, = bill_page.xpath(
            "//a[contains(@href, 'BillActions.aspx')]/@href")
        self._parse_house_actions(bill, actions_link)

        # get bill documents
        doc_tags = bill_page.xpath('//div[@class="BillDocsSection"][1]/span')
        for doc_tag in reversed(doc_tags):
            doc = clean_text(doc_tag.text_content())
            text_url = '%s%s' % (self._senate_base_url, doc_tag[0].attrib['href'])
            bill.add_document(doc, text_url, mimetype="text/html")

        # get bill versions
        version_tags = bill_page.xpath('//div[@class="BillDocsSection"][2]/span')
        for version_tag in reversed(version_tags):
            version = clean_text(version_tag.text_content())
            for vurl in version_tag.xpath(".//a"):
                if vurl.text == 'PDF':
                    mimetype = 'application/pdf'
                else:
                    mimetype = 'text/html'
                bill.add_version(version, vurl.attrib['href'],
                                 on_duplicate='use_new', mimetype=mimetype)

        self.save_bill(bill)
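The bill_types lookup above keys off the first three characters of the bill id. A sketch of the kind of mapping assumed; these entries are illustrative only, and the scraper's real table may differ or contain more prefixes:

# Hypothetical excerpt: bill id prefix -> bill type
bill_types = {
    "HB ": "bill",
    "HJR": "joint resolution",
    "HCR": "concurrent resolution",
}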
audio = r.listen(source) print("Loading vocab2int") vocab2int = pickle.load(open("Mood:Emotion Code/data/vocab2int.pickle", "rb")) model = get_model_emotions(len(vocab2int), sequence_length=sequence_length, embedding_size=embedding_size) model.load_weights("results/model_v1_0.59_0.76.h5") if __name__ == "__main__": import argparse # parser = argparse.ArgumentParser(description="Emotion classifier using text") # parser.add_argument("text", type=str, help="The text you want to analyze") # args = parser.parse_args() text = tokenize_words(clean_text(r.recognize_google(audio)), vocab2int) x = pad_sequences([text], maxlen=sequence_length) prediction = model.predict_classes(x)[0] probs = model.predict(x)[0] # print("hi:",index) print("Question asked: ", Textlist[index]) print("You said: " + r.recognize_google(audio)) print("Probs:") for i, category in categories.items(): print(f"{category.capitalize()}: {probs[i]*100:.2f}%") print("The most dominant emotion:", categories[prediction])
vocab_size = 2000 batch_size = 32 lr = 0.001 epochs = 500 n_chars = 500 # to generate temperature = 0.6 text_file = 'TheHitchhikersGuide.txt' # spm.SentencePieceTrainer.Train(f'--input={text_file} --model_prefix=tokens --vocab_size={vocab_size}') sp = spm.SentencePieceProcessor() sp.Load("tokens.model") text = open(text_file, 'rb').read().decode(encoding='utf-8') text = clean_text(text) text_as_int = np.array(sp.EncodeAsIds(text)) model = TransformerCharLM(vocab=vocab_size, d_model=384, n_heads=6, n_encoder_layers=10, d_ff=2048, dropout=0.1) model.to(device) criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(model.parameters(), lr=lr) try: for epoch in range(1, epochs + 1):
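The snippet above stops just as the training loop begins. A sketch of how text_as_int is typically cut into input/target batches for next-token language modeling; the function name and shapes are assumptions, not taken from the missing loop body:

import numpy as np
import torch

def make_lm_batches(text_as_int, seq_len, batch_size):
    """Illustrative: yield (inputs, targets) where targets are the inputs
    shifted one token to the right, as used for next-token prediction."""
    n = (len(text_as_int) - 1) // seq_len
    xs = text_as_int[:n * seq_len].reshape(n, seq_len)
    ys = text_as_int[1:n * seq_len + 1].reshape(n, seq_len)
    for i in range(0, n, batch_size):
        yield (torch.tensor(xs[i:i + batch_size], dtype=torch.long),
               torch.tensor(ys[i:i + batch_size], dtype=torch.long))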
def abs_summ_api(): if not request.json: abort(400) transcript = request.get_json()['entries'] id = request.args.get('id', default=None, type=str) callbackurl = request.args.get('callbackurl', default=None, type=str) enc = request.args.get('enc', default='utf-8', type=str) size = request.args.get('nkeys', default=100, type=int) lang = request.args.get('lang', default='fr', type=str) if id is None: abort(400) try: speakers = [] utterances = [] for item in transcript: speakers.append(item['speaker']) utterances.append(utils.clean_utterance(item['text'], resources[lang]['filler_words'])) utterances_tagged = [ ' '.join(['/'.join(t) for t in sent]) for sent in resources[lang]['pos_tagger'].tag_sents([u.split() for u in utterances]) ] data = zip(range(len(utterances)), speakers, utterances_tagged) communities = detection(data, resources[lang]['stopwords'], config) compressions, graphs = compression(communities, resources[lang]['stopwords'], resources[lang]['word_vectors'], resources[lang]['language_model'], config, lang) summary = selection(compressions, utterances, resources[lang]['stopwords'], resources[lang]['word_vectors'], config, size) # get CoreRank scores dict lists_of_terms = [] for sentence in utterances: lists_of_terms.append( utils.clean_text( copy.copy(sentence), stopwords=resources[lang]['stopwords'], remove_stopwords=config.getboolean('KEYWORDS', 'remove_stopwords'), pos_filtering=config.getboolean('KEYWORDS', 'pos_filtering'), stemming=config.getboolean('KEYWORDS', 'stemming'), lower_case=True # lower_case for CoreRank ) ) keywords = cr.get_core_rank_scores( lists_of_terms, window_size=config.getint('KEYWORDS', 'w'), overspanning=config.getboolean('KEYWORDS', 'overspanning'), weighted=config.getboolean('KEYWORDS', 'weighted') ) if callbackurl is None: return jsonify({'summary': summary, 'keywords': keywords}) else: r = requests.post(callbackurl, json={'summary': summary, 'keywords': keywords}) if r.status_code == requests.codes.ok: return "summary produced succesfully for meeting " + id else: raise RuntimeError() except Exception as e: print e return "got exception trying to run process"
if __name__ == "__main__": config = Config("./config.json") data = Data(config) print("Getting training and testing data...") training_data = data.get_data(data_set_name="train.csv", is_train_data=True) testing_data = data.get_data(data_set_name="test.csv", is_train_data=False) # print(training_data["question_text"]) print("Cleaning data...") training_data["question_text"] = training_data["question_text"].apply( lambda x: clean_text(x, MISPELL_DICT)) testing_data["question_text"] = testing_data["question_text"].apply( lambda x: clean_text(x, MISPELL_DICT)) # print(training_data["question_text"]) print("Getting word embedding...") word2idx = data.word_to_idx(training_data) # print("before : {}".format(len(word2idx))) # print(len(word2idx)) emb_dict = data.get_embedding_dict(embedding_name="newglove.840B.300d.txt", reset_embedding_table=False, word_set=word2idx.keys()) # print("After : {}".format(len(emb_dict))) emb_table = data.get_embedding_table(word2idx=word2idx,
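Here clean_text takes a second argument, a misspelling dictionary. A sketch of what that variant might look like, assuming MISPELL_DICT maps misspellings to corrections; the project's real helper is not shown and may differ:

import re

def clean_text(text, mispell_dict):
    """Hypothetical: lower-case the question and swap known misspellings."""
    text = text.lower()
    for wrong, right in mispell_dict.items():
        text = re.sub(r"\b" + re.escape(wrong) + r"\b", right, text)
    return text

# clean_text("Whta is a qoura user?", {"whta": "what", "qoura": "quora"})
# -> "what is a quora user?"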
with open( f"mlruns/{args.EXPERIMENT_ID}/{args.RUN_ID}/artifacts/files/x_char_encoder", "rb") as infile: x_char_encoder = dill.load(infile) with open( f"mlruns/{args.EXPERIMENT_ID}/{args.RUN_ID}/artifacts/files/y_ner_encoder", "rb") as infile: y_ner_encoder = dill.load(infile) with open( f"mlruns/{args.EXPERIMENT_ID}/{args.RUN_ID}/artifacts/files/tag_to_index", "rb") as infile: tag_to_index = dill.load(infile) X_text = clean_text(args.DATA_TEXT) X_text_list_as_is = [X_text.split(' ')] X_text_list = [[word.lower() for word in lst] for lst in X_text_list_as_is] X_tags, tag_to_index_infer = get_POS_tags(X_text_list) X_text_list = trim_list_of_lists_upto_max_len(X_text_list, max_sentence_len) X_text_list_as_is = trim_list_of_lists_upto_max_len( X_text_list_as_is, max_sentence_len) X_tags = trim_list_of_lists_upto_max_len(X_tags, max_sentence_len) alnum, numeric, alpha, digit, lower, title, ascii = enrich_data( X_text_list_as_is) alnum = pad_and_stack_list_of_list( alnum,
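trim_list_of_lists_upto_max_len and pad_and_stack_list_of_list are project helpers. Minimal sketches under the assumption that sequences are trimmed to max_sentence_len and right-padded with a constant before being stacked into a tensor; the real implementations may pad or encode differently:

import torch

def trim_list_of_lists_upto_max_len(list_of_lists, max_len):
    """Hypothetical: cut every inner sequence to at most max_len items."""
    return [lst[:max_len] for lst in list_of_lists]

def pad_and_stack_list_of_list(list_of_lists, max_len, pad_value=0):
    """Hypothetical: right-pad each sequence to max_len and stack into a tensor."""
    padded = [list(lst) + [pad_value] * (max_len - len(lst)) for lst in list_of_lists]
    return torch.tensor(padded)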
'CPU': 1, 'GPU': 0 }) from model import get_model_5stars from utils import clean_text, tokenize_words from config import embedding_size, sequence_length from keras.preprocessing.sequence import pad_sequences import pickle vocab2int = pickle.load(open("data/vocab2int.pickle", "rb")) model = get_model_5stars(len(vocab2int), sequence_length=sequence_length, embedding_size=embedding_size) model.load_weights("results/model_V20_0.38_0.80.h5") if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description="Food Review evaluator") parser.add_argument("review", type=str, help="The review of the product in text") args = parser.parse_args() review = tokenize_words(clean_text(args.review), vocab2int) x = pad_sequences([review], maxlen=sequence_length) print(f"{model.predict(x)[0][0]:.2f}/5")
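tokenize_words is imported from utils and not shown; a hypothetical stand-in that maps whitespace tokens through vocab2int and drops unknown words, plus a usage note (the script name is an assumption):

def tokenize_words(text, vocab2int):
    """Hypothetical stand-in for utils.tokenize_words: map each whitespace
    token to its integer id, skipping words missing from the vocabulary."""
    return [vocab2int[word] for word in text.split() if word in vocab2int]

# e.g.  python predict.py "tastes great and arrived quickly"
# prints a score formatted as "<score>/5"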