Code example #1
File: documents.py Project: olethanh/dierentheater
def _get_plenaries(dico, dico_nl, document):
    document.plenaries = []
    for key, key_nl in zip(sorted(filter(lambda x: re.match("(\d+. )?SEANCE PLENIERE CHAMBRE", x), dico.keys())),
                           sorted(filter(lambda x: re.match("(\d+. )?PLENAIRE VERGADERING KAMER", x), dico_nl.keys()))):
        pl = DocumentPlenary()
        pl.visibility["fr"] = clean_text(dico[key]["head"].text).split()[-1]
        pl.visibility["nl"] = clean_text(dico_nl[key_nl]["head"].text).split()[-1]
        pl.type["fr"] = " ".join(clean_text(dico[key]["head"].text).split()[:-1])
        pl.type["nl"] = " ".join(clean_text(dico_nl[key_nl]["head"].text).split()[:-1])

        pl.agenda = []
        if dico[key].get("Calendrier"):
            fr = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico[key]["Calendrier"].contents[::2])))
            nl = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico_nl[key_nl]["Kalender"].contents[::2])))
            for (_date, _type), (_, _type_nl) in zip(fr, nl):
                pl.agenda.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})

        pl.incident = []
        if dico[key].get("Incident"):
            fr = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico[key]["Incident"].contents[::2])))
            nl = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico_nl[key_nl]["Incident"].contents[::2])))
            for (_date, _type), (_, _type_nl) in zip(fr, nl):
                pl.incident.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})

        pl.save()
        document.plenaries.append(pl)
Code example #2
File: bills.py Project: marlonkeating/fiftystates
    def parse_house_cosponsors(self, bill, cell):
        # if there's only one sponsor, we don't have to worry about this.
        if (not cell.a.nextSibling or
            not cell.a.nextSibling.nextSibling or
            'href' not in cell.a.nextSibling.nextSibling):

            cosponsor_dirty = cell.a.em.contents[0]
            cosponsor = clean_text(cosponsor_dirty)
            bill.add_sponsor('cosponsor', cosponsor,
                             sponsor_link=cell.a['href'])
        else:
            # there are several sponsors, and we have to go to the bill text
            bill_text_url = cell.a.nextSibling.nextSibling['href']

            try:
                doc = self.urlopen(bill_text_url)

                # people between (Sponsor) and (Co-Sponsor) are the cosponsors
                m = re.search(r"\(Sponsor\),?(.*)\(Co", doc, re.DOTALL)
                if m:
                    cosponsor_list = clean_text(m.group(1))
                    cosponsor_list = re.split(" ?(?:,| AND ) ?",
                                              cosponsor_list)

                    for cosponsor_dirty in cosponsor_list:
                        cosponsor = clean_text(cosponsor_dirty)
                        bill.add_sponsor('cosponsor', cosponsor)
            except urllib2.HTTPError as e:
                if e.code == 404:
                    # Some of the bill text pages are broken, but the
                    # rest of the bill metadata is valid so just
                    # log the error and move on
                    self.log('404 on %s, continuing' % bill_text_url)
                else:
                    raise e
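The core of the multi-sponsor branch above is a regex capture plus a split: everything between "(Sponsor)" and "(Co-Sponsor)" in the bill text is treated as the cosponsor list. A minimal standalone sketch of that step, run against a made-up string instead of the page fetched from bill_text_url, and without the project's clean_text helper:

import re

# Made-up bill text; the real code downloads it from bill_text_url.
doc = "JONES (Sponsor), SMITH, DOE AND ROE (Co-Sponsor) ..."

m = re.search(r"\(Sponsor\),?(.*)\(Co", doc, re.DOTALL)
if m:
    # split the captured span on commas or a literal " AND "
    names = re.split(" ?(?:,| AND ) ?", m.group(1).strip())
    print([name for name in names if name])  # ['SMITH', 'DOE', 'ROE']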
Code example #3
File: documents.py Project: olethanh/dierentheater
def _get_in_charged_commissions(dico, dico_nl, document):
    document.in_charge_commissions = []
    for key, key_nl in zip(sorted(filter(lambda x: re.match("(\d+. )?COMMISSION CHAMBRE", x), dico.keys())), sorted(filter(lambda x: re.match("(\d+. )?COMMISSIE KAMER", x), dico_nl.keys()))):
        icc = InChargeCommissions()
        icc.visibility["fr"] = clean_text(dico[key]["head"].text).split()[-1]
        icc.visibility["nl"] = clean_text(dico_nl[key_nl]["head"].text).split()[-1]
        icc.commission["fr"] = " ".join(clean_text(dico[key]["head"].text).split()[:-1])
        icc.commission["nl"] = " ".join(clean_text(dico_nl[key_nl]["head"].text).split()[:-1])
        if dico[key].get("Rapporteur"):
            # FIXME link to actual deputies
            icc.rapporters = map(clean_text, dico[key]["Rapporteur"].text.split("\n\t\t\t\t\t"))

        icc.incident = []
        if dico[key].get("Incident"):
            fr = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico[key]["Incident"].contents[::2])))
            nl = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico_nl[key_nl]["Incident"].contents[::2])))
            for (_date, _type), (_, _type_nl) in zip(fr, nl):
                icc.incident.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})

        icc.agenda = []
        if dico[key].get("Calendrier"):
            fr = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico[key]["Calendrier"].contents[::2])))
            nl = filter(lambda x: x[0], map(lambda x: x.split(u" \xa0 ", 1), map(clean_text, dico_nl[key_nl]["Kalender"].contents[::2])))
            for (_date, _type), (_, _type_nl) in zip(fr, nl):
                icc.agenda.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})

        if dico[key].get("Rapport"):
            icc.rapport = {"url": dico[key]["Rapport"].a["href"], "date": clean_text(dico[key]["Rapport"].contents[-2])}

        icc.save()
        document.in_charge_commissions.append(icc)
Code example #4
File: documents.py Project: olethanh/dierentheater
def _get_document_chambre(dico, dico_nl, document):
    if not dico.get("Document Chambre"):
        return

    chambre_dico = dico['Document Chambre']
    chambre_dico_nl = dico_nl['Document Kamer']

    document_chambre = DocumentChambre()
    document_chambre.deposition_date = get_text_else_blank(chambre_dico, u'Date de dépôt')
    document_chambre.type["fr"] = chambre_dico[u'Type de document'].text
    document_chambre.type["nl"] = chambre_dico_nl[u'Document type'].text
    document_chambre.taken_in_account_date = get_text_else_blank(chambre_dico, u'Prise en considération')
    document_chambre.distribution_date = get_text_else_blank(chambre_dico, u'Date de distribution')
    document_chambre.sending_date = get_text_else_blank(chambre_dico, u'Date d\'envoi')
    document_chambre.ending_date = get_text_else_blank(chambre_dico, u'Date de fin')
    document_chambre.status["fr"] = get_text_else_blank(chambre_dico, u'Statut')
    document_chambre.status["nl"] = get_text_else_blank(chambre_dico_nl, u'Status')
    document_chambre.comments["fr"] = get_text_else_blank(chambre_dico, u'Commentaire').split(' ')
    document_chambre.comments["nl"] = get_text_else_blank(chambre_dico_nl, u'Commentaar').split(' ')

    _get_authors(chambre_dico, chambre_dico_nl, document_chambre)

    url, tipe, session = clean_text(str(chambre_dico[u'head']).replace("&#160;", "")).split("<br />")
    _, tipe_nl, _ = clean_text(str(chambre_dico_nl[u'head']).replace("&#160;", "")).split("<br />")
    url = re.search('href="([^"]+)', url).groups()[0] if "href" in url else url
    document_chambre.pdf = DocumentChambrePdf.objects.create(url=url, type={"fr": tipe.strip(), "nl": tipe_nl.strip()}, session=session.split()[-2])

    _get_next_documents(chambre_dico, chambre_dico_nl, document_chambre)

    if chambre_dico.get(u'Document(s) joint(s)/lié(s)'):
        document_chambre.joint_pdfs = [{"url": x.a["href"], "title": {"fr": x.contents[0][1:-1], "nl": y.contents[0][1:-1]}} for x, y in zip(chambre_dico[u'Document(s) joint(s)/lié(s)'],
                                                                                                                                             chambre_dico_nl[u'Gekoppeld(e)/verbonden document(en)'],)]

    document_chambre.save()
    document.document_chambre = document_chambre
Code example #5
File: bills.py Project: h4ck3rm1k3/openstates
 def parse_cosponsors_from_bill(self, bill, url):
     bill_page = self.urlopen(url)
     bill_page = lxml.html.fromstring(bill_page)
     sponsors_text = find_nodes_with_matching_text(
         bill_page, '//p/span', r'\s*INTRODUCED.*')
     if len(sponsors_text) == 0:
         # probably it's withdrawn
         return
     sponsors_text = sponsors_text[0].text_content()
     sponsors = clean_text(sponsors_text).split(',')
     # if there are several comma separated entries, list them.
     if len(sponsors) > 1:
         # the sponsor and the cosponsor were already taken from the
         # previous page, so ignore those:
         sponsors = sponsors[2::]
         for part in sponsors:
             parts = re.split(r' (?i)and ', part)
             for sponsor in parts:
                 cosponsor_name = clean_text(sponsor)
                 if cosponsor_name != "":
                     cosponsor_name = cosponsor_name.replace(
                         u'\u00a0', " ")  # epic hax
                     for name in re.split(r'\s+AND\s+', cosponsor_name):
                     # for name in cosponsor_name.split("AND"):
                         name = name.strip()
                         if name:
                             bill.add_sponsor('cosponsor', name)
Code example #6
File: session.py Project: manuelcortez/socializer
def add_text(status):
	""" This shorts the text to 140 characters for displaying it in the list control."""
	message = ""
	if status.has_key("copy_history"):
		txt = status["copy_history"][0]["text"]
	else:
		txt = status["text"]
	if len(txt) < 140:
		message = utils.clean_text(txt)
	else:
		message = utils.clean_text(txt[:139])
	return message
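The docstring describes the intent: statuses (or the reposted status stored under "copy_history") are trimmed to at most 140 characters before being shown in the list control. A rough usage sketch; the clean_text below is only a stand-in for socializer's utils.clean_text, which is not part of this listing:

# Stand-in for utils.clean_text (assumed here to just collapse whitespace).
def clean_text(text):
    return " ".join(text.split())

def add_text(status):
    """ Shortens the text to 140 characters for display in the list control."""
    txt = status["copy_history"][0]["text"] if "copy_history" in status else status["text"]
    return clean_text(txt) if len(txt) < 140 else clean_text(txt[:139])

status = {"text": "word " * 60}   # roughly 300 characters
print(len(add_text(status)))      # 139 after truncation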
Code example #7
File: bills.py Project: PamelaM/openstates
 def parse_cosponsors_from_bill(self, bill, url):
     with self.urlopen(url) as bill_page:
         bill_page = lxml.html.fromstring(bill_page)
         sponsors_text = find_nodes_with_matching_text(bill_page,'//p/span',r'\s*INTRODUCED.*')
         if len(sponsors_text) == 0:
             # probably it's withdrawn
             return
         sponsors_text = sponsors_text[0].text_content()
         sponsors = clean_text(sponsors_text).split(',')
         if len(sponsors) > 1: # if there are several comma separated entries, list them.
             # the sponsor and the cosponsor were already taken from the previous page, so ignore those:
             sponsors = sponsors[2::]
             for part in sponsors:
                 parts = re.split(r' (?i)and ',part)
                 for sponsor in parts:
                     bill.add_sponsor('cosponsor', clean_text(sponsor))
Code example #8
def _build_sub_section(i, dico):
    sub_section = clean_text(i.td.b.text)
    if dico.get(sub_section):
        raise Exception("'%s' is already use as a key for '%s'" % (sub_section, dico[sub_section]))
    dico[sub_section] = AccessControlDict()
    dico[sub_section]["head"] = i('td')[1]
    return sub_section
Code example #9
File: taggers.py Project: chrisleewashere/nlpnet
 def tag_tokens(self, tokens, no_repeats=False):
     """
     Runs the SRL process on the given tokens.
     
     :param tokens: a list of tokens (as strings)
     :param no_repeats: whether to prevent repeated argument labels
     :returns: a list of lists (one list for each sentence). Sentences have tuples 
         (all_tokens, predicate, arg_structure), where arg_structure is a dictionary 
         mapping argument labels to the words they include.
     """
     tokens_obj = [attributes.Token(utils.clean_text(t, False)) for t in tokens]
     converted_bound = np.array([self.boundary_reader.converter.convert(t) 
                                 for t in tokens_obj])
     converted_class = np.array([self.classify_reader.converter.convert(t) 
                                 for t in tokens_obj])
     
     pred_positions = self.find_predicates(tokens_obj)
     
     # first, argument boundary detection
     # the answer includes all predicates
     answers = self.boundary_nn.tag_sentence(converted_bound, pred_positions)
     boundaries = [[self.boundary_itd[x] for x in pred_answer] 
                   for pred_answer in answers]
     arg_limits = [utils.boundaries_to_arg_limits(pred_boundaries) 
                   for pred_boundaries in boundaries]
     
     # now, argument classification
     answers = self.classify_nn.tag_sentence(converted_class, 
                                             pred_positions, arg_limits,
                                             allow_repeats=not no_repeats)
     arguments = [[self.classify_itd[x] for x in pred_answer] 
                  for pred_answer in answers]
     
     structures = _group_arguments(tokens, pred_positions, boundaries, arguments)
     return SRLAnnotatedSentence(tokens, structures)
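As the docstring notes, each annotated sentence pairs the token list with one argument structure per predicate. A usage sketch along the lines of the nlpnet README's SRL example; it assumes a trained SRL model has already been downloaded into the (placeholder) data directory, and nlpnet ships such a model for Portuguese:

import nlpnet

# '/path/to/nlpnet-data' is a placeholder for a directory holding a downloaded SRL model.
nlpnet.set_data_dir('/path/to/nlpnet-data')

tagger = nlpnet.SRLTagger()
sent = tagger.tag(u'O rato roeu a roupa do rei de Roma.')[0]
print(sent.tokens)
for predicate, arg_structure in sent.arg_structures:
    print(predicate, arg_structure)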
Code example #10
File: data_cleanup.py Project: nhu2000/Project-2
 def df_transform(self, terms):    
     self.df[pd.isnull(self.df['Comment'])] = ""
     self.df = self.df.drop_duplicates('Comment')
     self.df['date'] = self.df['date'].apply(lambda x : unix_convert(x))
     self.df['Comment'] = self.df['Comment'].apply(lambda x: clean_text(str(x)))
     self.df['Sentiment_raw'] = self.df.apply(lambda row: sentiment(row['Comment']), axis = 1)
     self.df['Sentiment'] = self.df.apply(lambda row: sentiment_new(row['Comment'], terms), axis = 1)
     self.df['State'] = self.df.apply(lambda row: state_label(str(row['Locations'])), axis = 1)
     self.df = pd.merge(self.df, self.longlat, how='left', on='State')
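The same apply-driven cleaning pattern can be exercised on a toy DataFrame. A sketch with an assumed stand-in for clean_text (lowercase, strip punctuation); the project's real clean_text, sentiment and state_label helpers are not shown in this listing:

import re
import pandas as pd

def clean_text(text):
    # assumed stand-in: lowercase and drop punctuation
    return re.sub(r"[^\w\s]", "", text).lower().strip()

df = pd.DataFrame({"Comment": ["Great product!!!", None, "Great product!!!", "meh..."]})
df["Comment"] = df["Comment"].fillna("")          # simplification of the null handling above
df = df.drop_duplicates("Comment")
df["Comment"] = df["Comment"].apply(lambda x: clean_text(str(x)))
print(df)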
Code example #11
File: realtime.py Project: kelvan/gotoVienna
    def parse_stations(self, html):
        bs = BeautifulSoup(html)
        tables = bs.findAll('table', {'class':'show_fw'})
        st = {}

        for i in range(2):
            trs = tables[i].findAll('tr')
            direction = clean_text(trs[0].text.replace('Fahrtrichtung', ''))
            
            sta = []
            for tr in trs[2:-1]:
                if tr.a:
                    sta.append((clean_text(tr.a.text), defaults.base_url + tr.a['href']))
                else:
                    sta.append((clean_text(tr.text), None))

            st[direction] = sta
        return st
Code example #12
File: documents.py Project: olethanh/dierentheater
def _get_next_documents(chambre_dico, chambre_dico_nl, document_chambre):
    if chambre_dico.get('Document(s) suivant(s)'):
        for d, d_nl in zip(document_pdf_part_cutter(chambre_dico[u'Document(s) suivant(s)']), document_pdf_part_cutter(chambre_dico_nl[u'Opvolgend(e) document(en)'])):
            logger.debug("add pdf %s" % clean_text(d[0].font.text))
            doc = OtherDocumentChambrePdf()
            doc.url = d[0].a['href'] if d[0].a else d[0].td.text
            doc.type["fr"] = clean_text(d[0].font.text)
            doc.type["nl"] = clean_text(d_nl[0].font.text)
            doc.distribution_date = d[1]('td')[-1].text
            for dep, dep_nl in zip(d[2:], d_nl[2:]):
                if dep.a:
                    lachambre_id = re.search('key=(\d+)', dep.a["href"]).groups()[0]
                    deputy = Deputy.objects.get(lachambre_id=lachambre_id)
                    doc.authors.append({"lachambre_id": deputy.lachambre_id, "id": deputy.id, "full_name": deputy.full_name, "role": {"fr": dep('td')[-1].i.text[1:-1], "nl": dep_nl('td')[-1].i.text[1:-1]}})
                else:
                    doc.authors.append({"lachambre_id": -1, "id": -1, "full_name": dep('td')[-1].contents[2].strip(), "role": {"fr": dep('td')[-1].i.text[1:-1], "nl": dep_nl('td')[-1].i.text[1:-1]}})
            doc.save()
            document_chambre.other_pdfs.append(doc)
Code example #13
def _build_first_level(i, dico):
    key = clean_text(i.td.text)
    # we can get several 'Moniteur erratum' entries
    if unicode(key) in ('Moniteur erratum', 'Staatsblad erratum'):
        if not dico.get(key):
            dico[key] = []
        dico[key].append(i('td')[1])
    else:
        if dico.get(key):
            raise Exception("'%s' is already used as a key for '%s'" % (key, dico[key]))
        dico[key] = i('td')[1]
Code example #14
def _build_pdf_sub_section(i, dico, sub_section):
    key = clean_text(i.td.text)
    # we can have a list of joined documents
    if unicode(key) in (u'Document(s) joint(s)/lié(s)', u'Gekoppeld(e)/verbonden document(en)'):
        if not dico[sub_section].get(key):
            dico[sub_section][key] = []
        dico[sub_section][key].append(i('td')[1])
    elif dico[sub_section].get(key):
        raise Exception("'%s' is already used as a key in the sub_section '%s' for '%s'" % (key, sub_section, dico[sub_section][key]))
    else:
        dico[sub_section][key] = i('td')[1]
Code example #15
File: documents.py Project: olethanh/dierentheater
def _get_competences(dico, dico_nl, document):
    # FIXME: meh, DRY
    if dico.get(u"Compétence") and dico_nl.get(u"Bevoegdheid"):
        document.timeline = []
        for (_date, _title), (_, _title_nl) in zip([clean_text(x).split(u" \xa0 ", 1) for x in dico[u"Compétence"]["head"].contents[::2]],
                                                   [clean_text(x).split(u" \xa0 ", 1) for x in dico_nl[u"Bevoegdheid"]["head"].contents[::2]]):
            logger.debug("append time line %s %s %s" % (_date, _title, _title_nl))
            document.timeline.append(DocumentTimeLine.objects.create(title={"fr": _title, "nl": _title_nl}, date=_date))
    elif dico.get(u"Compétence"):
        document.timeline = []
        for (_date, _title) in [clean_text(x).split(u" \xa0 ", 1) for x in dico[u"Compétence"]["head"].contents[::2]]:
            logger.debug("append time line %s %s %s" % (_date, _title, ""))
            document.timeline.append(DocumentTimeLine.objects.create(title={"fr": _title, "nl": ""}, date=_date))
    elif dico_nl.get(u"Bevoegdheid"):
        document.timeline = []
        for (_date, _title_nl) in [clean_text(x).split(u" \xa0 ", 1) for x in dico_nl[u"Bevoegdheid"]["head"].contents[::2]]:
            logger.debug("append time line %s %s %s" % (_date, "", _title_nl))
            document.timeline.append(DocumentTimeLine.objects.create(title={"fr": "", "nl": _title_nl}, date=_date))
    if dico.get("Analyse des interventions"):
        document.analysis = get_or_create(Analysis, _id="lachambre_id", lachambre_id=dico["Analyse des interventions"]["head"].a.text, url=dico["Analyse des interventions"]["head"].a["href"])
Code example #16
File: autotag.py Project: jvalansi/autotag
 def get_document_features(self,document):
     '''
     Extract features from the document.
     Currently the only supported feature is the presence of a word in the document.

     :param document: the document text as a string.
     '''
     document = clean_text(document)
     document_words = set(document.split())
     features = {}
     for word in self.get_word_features():
         features['contains(%s)' % word] = (word in document_words)
     return features
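The feature scheme described in the docstring is a plain presence-of-word encoding. A self-contained sketch of that encoding, with a hard-coded word list standing in for self.get_word_features():

def get_document_features(document, word_features):
    # presence-of-word features, mirroring the contains(...) keys above
    document_words = set(document.lower().split())
    return {'contains(%s)' % word: (word in document_words) for word in word_features}

features = get_document_features("the tax bill was withdrawn", ["tax", "health", "bill"])
print(features)
# {'contains(tax)': True, 'contains(health)': False, 'contains(bill)': True}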
Code example #17
def document_pdf_part_cutter(soup):
    result = []
    blob = [soup('tr')[0]]
    for i in soup('tr')[1:]:
        if not clean_text(i.text):
            continue
        if not i.img or not i.img.get("class") or i.img["class"] != "picto":
            blob.append(i)
        else:
            result.append(blob)
            blob = [i]

    result.append(blob)
    return result
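Stripped of the BeautifulSoup specifics, document_pdf_part_cutter just cuts a sequence of rows into blocks, starting a new block at every 'picto' marker row (it also skips rows whose text is empty, which is omitted here). A plain-list sketch of that grouping logic:

def cut_into_blocks(rows, is_marker):
    # start a new block whenever a marker row is seen
    result, blob = [], [rows[0]]
    for row in rows[1:]:
        if is_marker(row):
            result.append(blob)
            blob = [row]
        else:
            blob.append(row)
    result.append(blob)
    return result

rows = ["pdf A", "author 1", "author 2", "MARKER pdf B", "author 3"]
print(cut_into_blocks(rows, lambda r: r.startswith("MARKER")))
# [['pdf A', 'author 1', 'author 2'], ['MARKER pdf B', 'author 3']]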
Code example #18
File: autotag.py Project: jvalansi/django-autotag
def hk_freq(data_dir, hk_dir):
    print("hk freq")
    data = get_json_data(data_dir)
    at = AutoTag()
    for entry in data:
        entry["text"] = clean_text(entry["text"])
    if not os.path.isdir(hk_dir):
        os.mkdir(hk_dir)
    with open(hk_dir + "total", "w") as f:
        pass
    word_count = at.count_data([w for entry in data for w in entry["text"].split()], hk_dir + "total")
    words = [w.encode("utf-8") for w, c in word_count if c > 40]
    with open(hk_dir + "freqs.csv", "wb") as csvfile:
        #         data_encoded = [w.encode('utf-8') for w,c in word_count if c > 40]
        w = csv.writer(csvfile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
        w.writerow([u"HK"] + words)
    #         csvfile.write(','.join([u'HK']+words) + '\n')

    hkwords = {}
    data_json = get_json(data_dir)
    for json_entry in data_json:
        if json_entry["model"] != "facebook_feeds.facebook_feed":
            continue
        name = json_entry["fields"]["name"]
        print(name)
        if not name:
            continue
        name = name.encode("utf-8")
        word_count = at.count_data(
            [w for entry in data for w in entry["text"].split() if entry["feed"] == json_entry["pk"]], hk_dir + name
        )
        word_dict = {w.encode("utf-8"): c for w, c in word_count}
        hkwords[name] = []
        for word in words:
            if word not in word_dict:
                hkwords[name].append(str(0))
            else:
                hkwords[name].append(str(word_dict[word]))
        with open(hk_dir + "freqs.csv", "a") as csvfile:
            writer = csv.writer(csvfile, delimiter=",")
            #             writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerow([name] + hkwords[name])

    with open(hk_dir + "freqs_t.csv", "a") as csvfile:
        writer = csv.writer(csvfile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for name in hkwords:
            writer.writerow([name] + hkwords[name])
Code example #19
File: taggers.py Project: chrisleewashere/nlpnet
 def tag_tokens(self, tokens):
     """
     Tags a given list of tokens. 
     
     Tokens should be produced with the nlpnet tokenizer in order to 
     match the entries in the vocabulary. If you have non-tokenized text,
     use POSTagger.tag(text).
     
     :param tokens: a list of strings
     :returns: a list of strings (the tags)
     """
     converter = self.reader.converter
     converted_tokens = np.array([converter.convert(utils.clean_text(token, False)) 
                                  for token in tokens])
     answer = self.nn.tag_sentence(converted_tokens)
     tags = [self.itd[tag] for tag in answer]
     return tags
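The docstring recommends POSTagger.tag() for raw, untokenized text. A usage sketch along the lines of the nlpnet documentation; the data directory is a placeholder and a trained POS model (in the same language as the input) must already be downloaded there:

import nlpnet

# placeholder path; a downloaded POS model must live here
nlpnet.set_data_dir('/path/to/nlpnet-data')
tagger = nlpnet.POSTagger()

print(tagger.tag('O rato roeu a roupa do rei de Roma.'))   # tokenizes internally
print(tagger.tag_tokens('O rato roeu a roupa'.split()))    # pre-tokenized input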
Code example #20
def compile_episode_transcript(trans_id, db):
    """
    Uses the Audiosearch database to compile a transcript for the podcast
    episode associated with trans_id.

    Parameters
    ----------
    trans_id : int
        The Audiosearch transcript ID for a particular podcast episode as
        found using find_episode_transcript_ids

    db : database connection
        The connection to the Audiosearch Postgres database

    Returns
    -------
    transcript : np.array of shape (n, 4)
        An array containing the transcript for the podcast episode associated
        with trans_id. Each row corresponds to a line in the transcript, and
        the columns correspond to [start_time, end_time, utterance, speaker_id]
    """
    transcript = []
    trans = get_transcript(db, trans_id).sort_values(by="start_time")

    # line contents: [start_time, end_time, utterance, speaker_id]
    for idx in range(trans.shape[0]):
        speaker = trans['speaker_id'][idx]
        text = clean_text(trans['text'][idx])
        start = trans['start_time'][idx]/60.
        end = trans['end_time'][idx]/60.

        if speaker is None or np.isnan(speaker):
          speaker = -1

        # this happens a lot in the audiosearch db..
        if text == '.':
          continue

        line = [start, end, text, speaker]

        # skip duplicate lines
        if idx > 0 and line[2] == transcript[-1][2]:
          continue

        transcript.append(line)
    return np.asarray(transcript)
Code example #21
File: autotag.py Project: jvalansi/autotag
def hk_freq(data_dir, hk_dir):
    print('hk freq')
    data = get_json_data(data_dir)
    at = AutoTag()
    for entry in data:
        entry['text'] = clean_text(entry['text'])
    if not os.path.isdir(hk_dir):
        os.mkdir(hk_dir)
    with open(hk_dir+'total', 'w') as f:
        pass
    word_count = at.count_data([w for entry in data for w in entry['text'].split()],hk_dir+'total')
    words = [w.encode('utf-8') for w,c in word_count if c > 40]
    with open(hk_dir+'freqs.csv', 'wb') as csvfile:
#         data_encoded = [w.encode('utf-8') for w,c in word_count if c > 40]
        w = csv.writer(csvfile, delimiter = ',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        w.writerow([u'HK']+words)
#         csvfile.write(','.join([u'HK']+words) + '\n')
   
    hkwords = {}
    data_json = get_json(data_dir)
    for json_entry in data_json:
        if json_entry['model'] != "facebook_feeds.facebook_feed":
            continue
        name = json_entry['fields']['name']
        print(name) 
        if not name:
            continue
        name = name.encode('utf-8')
        word_count = at.count_data([w for entry in data for w in entry['text'].split() if entry["feed"] == json_entry['pk']],hk_dir+name)
        word_dict = {w.encode('utf-8'):c for w,c in word_count}
        hkwords[name] = []
        for word in words:
            if word not in word_dict:
                hkwords[name].append(str(0))
            else:
                hkwords[name].append(str(word_dict[word])) 
        with open(hk_dir+'freqs.csv', 'a') as csvfile:
            writer = csv.writer(csvfile, delimiter=',')
#             writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerow([name]+hkwords[name])
     
    
    with open(hk_dir+'freqs_t.csv', 'a') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for name in hkwords:
            writer.writerow([name]+hkwords[name])
Code example #22
File: get_legislation.py Project: rcadby/fiftystates
    def parse_senate_cosponsors(self, bill, url):
        bill.add_source(url)
        with self.soup_context(url) as cosponsors_page:
            # cosponsors are all in a table
            cosponsor_table = cosponsors_page.find(id="dgCoSponsors")
            cosponsors = cosponsor_table.findAll("tr")

            for cosponsor_row in cosponsors:
                # cosponsors include district, so parse that out
                cosponsor_string = cosponsor_row.font.contents[0]
                cosponsor = clean_text(cosponsor_string)

                # they give us a link to the congressperson, so we might
                # as well keep it.
                cosponsor_url = cosponsor_row.a.href

                bill.add_sponsor('cosponsor', cosponsor,
                                 sponsor_link=cosponsor_url)
Code example #23
File: bills.py Project: PamelaM/openstates
    def parse_senate_cosponsors(self, bill, url):
        bill.add_source(url)
        with self.urlopen(url) as cosponsors_page:
            cosponsors_page = lxml.html.fromstring(cosponsors_page)
            # cosponsors are all in a table
            cosponsors = cosponsors_page.xpath('//table[@id="dgCoSponsors"]/tr/td/a')
            #print "looking for cosponsors = %s" % cosponsors

            for cosponsor_row in cosponsors:
                # cosponsors include district, so parse that out
                cosponsor_string = cosponsor_row.text_content()
                cosponsor = clean_text(cosponsor_string)
                cosponsor = cosponsor.split(',')[0]

                # they give us a link to the congressperson, so we might
                # as well keep it.
                cosponsor_url = cosponsor_row.attrib['href']

                bill.add_sponsor('cosponsor', cosponsor, sponsor_link=cosponsor_url)
Code example #24
File: autotag.py Project: jvalansi/autotag
 def test_doc(self,document,tags,thresh=0.3):
     '''
     Test which tags should be applied to the given document.
     
     :param document: an entry with 'text' key.
     :param tags: tags to test.
     :param thresh: threshold for the tag probability.
     :return probs: list of the most probable tags and their probability.
     '''
     document["text"] = clean_text(document["text"])
     document["features"] = self.get_document_features(document["text"])
     probs = []
     for tag in tags:
         classifier = self.load_classifier(tag)
         if not classifier:
             continue
         prob = classifier.prob_classify(document["features"])
         if prob.prob(True) > thresh:
             probs.append((prob.prob(True),tag))
     probs = sorted(probs,reverse=True)
     return probs
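The per-tag loop above leans on NLTK's prob_classify API: each saved classifier gives the probability that its tag applies, and tags above the threshold are kept. A self-contained sketch of the same idea, with a toy NaiveBayesClassifier standing in for the classifiers loaded from disk:

from nltk import NaiveBayesClassifier

# toy training data in the same contains(word) feature format used above
train = [
    ({'contains(tax)': True,  'contains(game)': False}, True),
    ({'contains(tax)': True,  'contains(game)': False}, True),
    ({'contains(tax)': False, 'contains(game)': True},  False),
]
classifier = NaiveBayesClassifier.train(train)

features = {'contains(tax)': True, 'contains(game)': False}
dist = classifier.prob_classify(features)
if dist.prob(True) > 0.3:   # same kind of threshold as test_doc's `thresh`
    print("tag applies with probability %.2f" % dist.prob(True))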
Code example #25
File: documents.py Project: olethanh/dierentheater
def _get_document_senat(dico, dico_nl, document):
    if not dico.get(u"Document Sénat"):
        return

    senat_dico = dico[u"Document Sénat"]
    senat_dico_nl = dico_nl[u"Document Senaat"]

    document_senat = DocumentSenat()
    document_senat.deposition_date = senat_dico[u"Date de dépôt"].text
    document_senat.ending_date = get_text_else_blank(senat_dico, u"Date de fin")
    document_senat.type["fr"] = senat_dico[u"Type de document"].text
    document_senat.type["nl"] = senat_dico_nl[u"Document type"].text
    document_senat.comments["fr"] = get_text_else_blank(senat_dico, u'Commentaire').split(' - ')
    document_senat.comments["nl"] = get_text_else_blank(senat_dico_nl, u'Commentaar').split(' - ')
    document_senat.author = clean_text(get_text_else_blank(senat_dico, u"Auteur(s)"))
    document_senat.status["fr"] = get_text_else_blank(senat_dico, u'Statut')
    document_senat.status["nl"] = get_text_else_blank(senat_dico_nl, u'Status')

    url, tipe, session = clean_text(str(senat_dico[u'head']).replace("&#160;", "")).split("<br />")
    _, tipe_nl, _ = clean_text(str(senat_dico_nl[u'head']).replace("&#160;", "")).split("<br />")
    url = re.search('href="([^"]+)', url).groups()[0] if "href" in url else url
    document_senat.pdf = DocumentSenatPdf.objects.create(url=url, type={"fr": tipe.strip(), "nl": tipe_nl.strip()}, session=session.split()[-2])

    if senat_dico.get('Document(s) suivant(s)'):
        for d, d_nl in zip(document_pdf_part_cutter(senat_dico[u'Document(s) suivant(s)']), document_pdf_part_cutter(senat_dico_nl[u'Opvolgend(e) document(en)'])):
            logger.debug("add pdf %s" % clean_text(d[0].font.text))
            doc = OtherDocumentSenatPdf()
            doc.url = d[0].a['href'] if d[0].a else d[0].td.text
            doc.type["fr"] = clean_text(d[0].font.text)
            doc.type["nl"] = clean_text(d_nl[0].font.text)
            doc.date = d[0]('td')[-1].contents[0]
            doc.authors = []
            for dep, dep_nl in zip(d[1:], d_nl[1:]):
                doc.authors.append({"full_name": unicode(dep('td')[-1].contents[2]).strip(), "role": {"fr": dep('td')[-1].i.text[1:-1], "nl": dep_nl('td')[-1].i.text[1:-1]}})
            doc.save()
            document_senat.other_pdfs.append(doc)

    document_senat.save()
    document.document_senat = document_senat
Code example #26
File: documents.py Project: olethanh/dierentheater
def _get_first_level_data(dico, dico_nl, document):
    document.deposition_date = get_text_else_blank(dico, u"Date de dépôt")
    document.constitution_article["fr"] = clean_text(get_text_else_blank(dico, "Article Constitution"))
    document.constitution_article["nl"] = clean_text(get_text_else_blank(dico_nl, "Artikel Grondwet"))
    if dico.get("Descripteur Eurovoc principal"):
        document.eurovoc_main_descriptor["fr"] = dico["Descripteur Eurovoc principal"]["head"].text
    if dico.get("Eurovoc-hoofddescriptor"):
        document.eurovoc_main_descriptor["nl"] = dico_nl["Eurovoc-hoofddescriptor"]["head"].text
    document.vote_date = get_text_else_blank(dico, "Vote Chambre")
    document.law_date = get_text_else_blank(dico, "Date de la loi")
    document.moniteur_number = get_text_else_blank(dico, u"Moniteur n°")
    document.moniteur_date = get_text_else_blank(dico, u"Date moniteur")
    document.vote_senat_date = get_text_else_blank(dico, u"Vote Sénat")
    document.candidature_vote_date = get_text_else_blank(dico, u"Vote candidature")

    if dico.get("Etat d'avancement"):
        document.status_chambre["fr"] = clean_text(dico["Etat d'avancement"].contents[0])
        document.status_senat["fr"] = clean_text(dico["Etat d'avancement"].contents[2]) if len(dico["Etat d'avancement"]) >= 3 else None
    if dico.get("Stand van zaken"):
        document.status_chambre["nl"] = clean_text(dico_nl["Stand van zaken"].contents[0])
        document.status_senat["nl"] = clean_text(dico_nl["Stand van zaken"].contents[2]) if len(dico_nl["Stand van zaken"]) >= 3 else None

    if dico.get("Descripteurs Eurovoc"):
        document.eurovoc_descriptors["fr"] = map(lambda x: x.strip(), dico["Descripteurs Eurovoc"]["head"].text.split("|"))
    if dico.get("Eurovoc descriptoren"):
        document.eurovoc_descriptors["nl"] = map(lambda x: x.strip(), dico_nl["Eurovoc descriptoren"]["head"].text.split("|"))
    if dico.get("Candidats-descripteurs Eurovoc"):
        document.eurovoc_candidats_descriptors["fr"] = map(lambda x: x.strip(), dico["Candidats-descripteurs Eurovoc"]["head"].text.split("|"))
    if dico.get("Eurovoc kandidaat-descriptoren"):
        document.eurovoc_candidats_descriptors["nl"] = map(lambda x: x.strip(), dico_nl["Eurovoc kandidaat-descriptoren"]["head"].text.split("|"))
    if dico.get(u"Mots-clés libres"):
        document.keywords["fr"] = map(lambda x: x.strip(), dico[u"Mots-clés libres"]["head"].text.split("|"))
    if dico.get(u"Vrije trefwoorden"):
        document.keywords["nl"] = map(lambda x: x.strip(), dico_nl[u"Vrije trefwoorden"]["head"].text.split("|"))
    if dico.get("Documents principaux"):
        document.main_docs["fr"] = map(lambda x: x.strip(), filter(lambda x: x != "<br>", dico["Documents principaux"].contents))
    if dico.get("Hoodfdocumenten"):
        document.main_docs["nl"] = map(lambda x: x.strip(), filter(lambda x: x != "<br>", dico_nl["Hoodfdocumenten"].contents))
Code example #27
    def buildTextBlocks(self, filterPos=None, days_to_avoid=[]):
        MIN_TOKENS = 10
        THRESHOLD = 0.4

        textBlocksPerDay = {}
        blockVectorsPerDay = {}

        for date, listMsgs in self.iChat.conversation.iteritems():
            #print date
            if date in days_to_avoid:
                continue

            textBlocksPerDay[date] = []
            blockVectorsPerDay[date] = []

            listMsgs = self.iChat.conversation[date]
            #print "Num msg",len(listMsgs)
            lastVector = None
            acumTokens = 0
            acumText = []
            textBlocks = []
            blockVector = None
            blockVectors = []
            for idx, dictMsg in enumerate(listMsgs):

                text = dictMsg["text"]
                if not filterPos:
                    cleanTokens = utils.clean_text(text)
                else:
                    cleanTokens = utils.clean_text(text, True, filterPos)

                acumTokens += len(cleanTokens)
                acumText.extend(cleanTokens)

                if acumTokens >= MIN_TOKENS:
                    if not blockVector:
                        vector = self.iSQL.getMsgVector(" ".join(acumText))
                        blockVector = vector
                    else:
                        vector = self.iSQL.getMsgVector(text)
                        if not vector:
                            continue

                        distance = self.iSQL.distance(vector, blockVector)
                        #print distance
                        if distance > THRESHOLD:
                            textBlocks.append(acumText)
                            blockVectors.append(blockVector)
                            blockVector = []
                            #print acumText
                            acumText = []
                            acumTokens = 0
                        else:
                            #print "aggregating"
                            blockVector = self.iSQL.aggregateVectors(
                                blockVector, vector)

            blockVectorsPerDay[date] = blockVectors
            textBlocksPerDay[date] = textBlocks

            #print len(textBlocks)

        return blockVectorsPerDay, textBlocksPerDay
Code example #28
File: evaluate.py Project: ololo123321/digital-peter
import os
from argparse import ArgumentParser

from utils import evaluate, clean_text

if __name__ == "__main__":
    """
    python evaluate.py \
        --predictions_dir=./predictions/valid_predictions \
        --answers_dir=./data/valid_texts
    """
    parser = ArgumentParser()
    parser.add_argument("--predictions_dir")
    parser.add_argument("--answers_dir")
    parser.add_argument("--items_to_display", type=int, default=10)
    args = parser.parse_args()

    true_texts = []
    pred_texts = []
    for file in os.listdir(args.predictions_dir):
        true_texts.append(
            clean_text(open(os.path.join(args.answers_dir, file)).readline()))
        pred_texts.append(
            open(os.path.join(args.predictions_dir, file)).readline())

    evaluate(true_texts=true_texts,
             pred_texts=pred_texts,
             top_k=args.items_to_display)
Code example #29
def eval(context, question):
    with open(os.path.join(config.data_dir, "train", "word2idx.pkl"), "rb") as wi, \
         open(os.path.join(config.data_dir, "train", "char2idx.pkl"), "rb") as ci, \
         open(os.path.join(config.data_dir, "train", "word_embeddings.pkl"), "rb") as wb, \
         open(os.path.join(config.data_dir, "train", "char_embeddings.pkl"), "rb") as cb:
        word2idx = pickle.load(wi)
        char2idx = pickle.load(ci)
        word_embedding_matrix = pickle.load(wb)
        char_embedding_matrix = pickle.load(cb)

    # transform them into Tensors
    word_embedding_matrix = torch.from_numpy(
        np.array(word_embedding_matrix)).type(torch.float32)
    char_embedding_matrix = torch.from_numpy(
        np.array(char_embedding_matrix)).type(torch.float32)
    idx2word = dict([(y, x) for x, y in word2idx.items()])

    context = clean_text(context)
    context = [w for w in word_tokenize(context) if w]

    question = clean_text(question)
    question = [w for w in word_tokenize(question) if w]

    if len(context) > config.max_len_context:
        print("The context is too long. Maximum accepted length is",
              config.max_len_context, "words.")
    if max([len(w) for w in context]) > config.max_len_word:
        print("Some words in the context are longer than", config.max_len_word,
              "characters.")
    if len(question) > config.max_len_question:
        print("The question is too long. Maximum accepted length is",
              config.max_len_question, "words.")
    if max([len(w) for w in question]) > config.max_len_word:
        print("Some words in the question are longer than",
              config.max_len_word, "characters.")
    if len(question) < 3:
        print(
            "The question is too short. It needs to be at least a three words question."
        )

    context_idx = np.zeros([config.max_len_context], dtype=np.int32)
    question_idx = np.zeros([config.max_len_question], dtype=np.int32)
    context_char_idx = np.zeros([config.max_len_context, config.max_len_word],
                                dtype=np.int32)
    question_char_idx = np.zeros(
        [config.max_len_question, config.max_len_word], dtype=np.int32)

    # replace 0 values with word and char IDs
    for j, word in enumerate(context):
        if word in word2idx:
            context_idx[j] = word2idx[word]
        else:
            context_idx[j] = 1
        for k, char in enumerate(word):
            if char in char2idx:
                context_char_idx[j, k] = char2idx[char]
            else:
                context_char_idx[j, k] = 1

    for j, word in enumerate(question):
        if word in word2idx:
            question_idx[j] = word2idx[word]
        else:
            question_idx[j] = 1
        for k, char in enumerate(word):
            if char in char2idx:
                question_char_idx[j, k] = char2idx[char]
            else:
                question_char_idx[j, k] = 1

    model = BiDAF(word_vectors=word_embedding_matrix,
                  char_vectors=char_embedding_matrix,
                  hidden_size=config.hidden_size,
                  drop_prob=config.drop_prob)
    try:
        if config.cuda:
            model.load_state_dict(
                torch.load(os.path.join(config.squad_models,
                                        "model_final.pkl"))["state_dict"])
        else:
            model.load_state_dict(
                torch.load(
                    os.path.join(config.squad_models, "model_final.pkl"),
                    map_location=lambda storage, loc: storage)["state_dict"])
        print("Model weights successfully loaded.")
    except:
        pass
        print(
            "Model weights not found, initialized model with random weights.")
    model.to(device)
    model.eval()
    with torch.no_grad():
        context_idx, context_char_idx, question_idx, question_char_idx = torch.tensor(context_idx, dtype=torch.int64).unsqueeze(0).to(device),\
                                                                         torch.tensor(context_char_idx, dtype=torch.int64).unsqueeze(0).to(device),\
                                                                         torch.tensor(question_idx, dtype=torch.int64).unsqueeze(0).to(device),\
                                                                         torch.tensor(question_char_idx, dtype=torch.int64).unsqueeze(0).to(device)

        pred1, pred2 = model(context_idx, context_char_idx, question_idx,
                             question_char_idx)
        starts, ends = discretize(pred1.exp(), pred2.exp(), 15, False)
        prediction = " ".join(context[starts.item():ends.item() + 1])

    return prediction
Code example #30
File: realtime.py Project: kelvan/gotoVienna
    def parse_departures(self, html):
        bs = BeautifulSoup(html)
        dep = []

        # Check for error messages
        msg = bs.findAll('span', {'class': 'rot fett'})
        if msg and len(msg) > 0 and unicode(msg[0].text).find(u'technischen St') > 0:
            print '\n'.join(map(lambda x: x.text.replace('&nbsp;', ''), msg))
            return []
        
        errtable = bs.find('table', {'class':'errortable'})
        if errtable and clean_text(errtable.text):
            print "Errortable found"
            print errtable.text
            return []

        if bs.table and bs.table.tr:
            st_td = bs.table.tr.findAll('td')
        
            if st_td:
                station = clean_text(st_td[-1].text)
            else:
                print "Unexpected Error: Stationname not found"
                print "Debug:", st_td.encode('UTF-8')
        else:
            print "Unexpected Error: table or tr not found"
            print bs
            return []
        
        # zusatztext crap
        zt = bs.find('td', {'class':'zusatztext'})
        if zt:
            ma = ZUSATZTEXT_REGEX.search(zt.text)
            if ma:
                line = ma.group(1)
                direction = ma.group(2)
                if direction == direction.upper():
                    direction = direction.capitalize()
                tim = int(ma.group(3))
                d = Departure(line=line, direction=direction,
                              lowfloor=True, station=station, time=tim)
                dep.append(d)
            else:
                print zt.text
        
        table = bs.find('table', {'class':'imagetable'})
        if not table:
            print "table not found"
            return []
        
        if errtable:
            print "Warning: Empty errortable found"
            return dep
        
        trs = table.findAll('tr')
        
        for tr in trs[1:]:
            tds = tr.findAll('td')
            line = clean_text(tds[0].text)
            direction = clean_text(tds[1].text)
            
            if direction.startswith(line):
                direction = direction.lstrip(line).strip()
                
            if direction == direction.upper():
                direction = direction.capitalize()
            
            lf_img = tds[-1].img
            
            lowfloor = lf_img and lf_img.has_key('alt')
            
            d = {'line': line,
                 'direction': direction,
                 'lowfloor': lowfloor,
                 'station': station}

            # parse time
            tim = clean_text(tds[2].text)
            dts = DELTATIME_REGEX.search(tim)
            abs = ABSTIME_REGEX.search(tim)
            
            if tim.find(u'...in K\xfcrze') >= 0:
                d['time'] = 0
            elif abs:
                d['time'] = calc_datetime(abs.group(1))
            elif tim.isdigit():
                d['time'] = int(tim)
            elif dts:
                # is timedelta
                d['time'] = int(dts.group(1))
            else:
                print "Error parsing time:", tim
                continue

            dep.append(Departure(**d))

        return dep
Code example #31
File: bills.py Project: kevinthew/openstates
    def parse_house_bill(self, url, session):
        # using the print page makes the page simpler, and also *drastically* smaller (8k rather than 100k)
        url = re.sub("billsummary", "billsummaryprn", url)
        url = "%s/%s" % (self.senate_base_url, url)

        with self.urlopen(url) as bill_page:
            bill_page = lxml.html.fromstring(bill_page)

            bill_id = bill_page.xpath('//*[@class="entry-title"]')
            if len(bill_id) == 0:
                self.log("WARNING: bill summary page is blank! (%s)" % url)
                self.bad_urls.append(url)
                return
            bill_id = bill_id[0].text_content()
            bill_id = clean_text(bill_id)

            bill_desc = bill_page.xpath('//*[@class="BillDescription"]')[0].text_content()
            bill_desc = clean_text(bill_desc)

            table_rows = bill_page.xpath("//table/tr")
            # if there is a cosponsor, all rows are pushed down one to make room for the extra cosponsor row:
            cosponsorOffset = 0
            if table_rows[2][0].text_content().strip() == "Co-Sponsor:":
                cosponsorOffset = 1

            lr_label_tag = table_rows[3 + cosponsorOffset]
            assert lr_label_tag[0].text_content().strip() == "LR Number:"
            bill_lr = lr_label_tag[1].text_content()

            lastActionOffset = 0
            if table_rows[4 + cosponsorOffset][0].text_content().strip() == "Governor Action:":
                lastActionOffset = 1
            official_title_tag = table_rows[5 + cosponsorOffset + lastActionOffset]
            assert official_title_tag[0].text_content().strip() == "Bill String:"
            official_title = official_title_tag[1].text_content()

            # could substitute the description for the name,
            # but keeping it separate for now.

            bill_type = "bill"
            triplet = bill_id[:3]
            if triplet in bill_types:
                bill_type = bill_types[triplet]

            subs = []
            bid = bill_id.replace(" ", "")

            if bid in self.subjects:
                subs = self.subjects[bid]
                self.log("With subjects for this bill")

            self.log(bid)

            bill = Bill(
                session,
                "lower",
                bill_id,
                bill_desc,
                bill_url=url,
                bill_lr=bill_lr,
                official_title=official_title,
                type=bill_type,
                subjects=subs,
            )
            bill.add_source(url)

            bill_sponsor = clean_text(table_rows[0][1].text_content())
            try:
                bill_sponsor_link = table_rows[0][1][0].attrib["href"]
            except IndexError:
                return

            if bill_sponsor_link:
                bill_sponsor_link = "%s%s" % (self.senate_base_url, bill_sponsor_link)

            bill.add_sponsor("primary", bill_sponsor, sponsor_link=bill_sponsor_link)

            # check for cosponsors
            if cosponsorOffset == 1:
                if len(table_rows[2][1]) == 1:  # just a name
                    cosponsor = table_rows[2][1][0]
                    bill.add_sponsor(
                        "cosponsor",
                        cosponsor.text_content(),
                        sponsor_link="%s/%s" % (self.senate_base_url, cosponsor.attrib["href"]),
                    )
                else:  # name ... etal
                    try:
                        cosponsor = table_rows[2][1][0]
                        bill.add_sponsor(
                            "cosponsor",
                            clean_text(cosponsor.text_content()),
                            sponsor_link="%s/%s" % (self.senate_base_url, cosponsor.attrib["href"]),
                        )
                        self.parse_cosponsors_from_bill(
                            bill, "%s/%s" % (self.senate_base_url, table_rows[2][1][1].attrib["href"])
                        )
                    except scrapelib.HTTPError as e:
                        self.log("WARNING: " + str(e))
                        self.bad_urls.append(url)
                        self.log("WARNING: no bill summary page (%s)" % url)

            actions_link_tag = bill_page.xpath('//div[@class="Sections"]/a')[0]
            actions_link = "%s/%s" % (self.senate_base_url, actions_link_tag.attrib["href"])
            actions_link = re.sub("content", "print", actions_link)
            self.parse_house_actions(bill, actions_link)

            # get bill documents
            doc_tags = bill_page.xpath('//div[@class="BillDocsSection"][1]/span')
            for doc_tag in reversed(doc_tags):
                doc = clean_text(doc_tag.text_content())
                text_url = "%s%s" % (self.senate_base_url, doc_tag[0].attrib["href"])
                bill.add_document(doc, text_url, mimetype="text/html")

            # get bill versions
            version_tags = bill_page.xpath('//div[@class="BillDocsSection"][2]/span')
            for version_tag in reversed(version_tags):
                version = clean_text(version_tag.text_content())
                text_url = "%s%s" % (self.senate_base_url, version_tag[0].attrib["href"])
                pdf_url = "%s%s" % (self.senate_base_url, version_tag[1].attrib["href"])
                bill.add_version(version, text_url, pdf_url=pdf_url, on_duplicate="use_new")
        self.save_bill(bill)
Code example #32
def retrieve(dataset, index, filename=None):
    if index >= len(dataset):
        st.error(f"Index {index} exceeds dataset length.")

    eval_dataset = None
    if filename:
        # TODO Handle this through dedicated fields
        if "cnn_dailymail" in filename:
            eval_dataset = "cnndm"
        elif "xsum" in filename:
            eval_dataset = "xsum"

    data = dataset[index]
    id_ = data.get('id', '')

    try:
        document = data['spacy:document']
    except KeyError:
        if not is_lg:
            st.error(
                "'en_core_web_lg model' is required unless loading from cached file."
                "To install: 'python -m spacy download en_core_web_lg'")
        try:
            text = data['document']
        except KeyError:
            text = data['article']
        if not text:
            st.error("Document is blank")
            return
        document = nlp(text if args.no_clean else clean_text(text))
    document._.name = "Document"
    document._.column = "document"

    try:
        reference = data['spacy:summary:reference']

    except KeyError:
        if not is_lg:
            st.error(
                "'en_core_web_lg model' is required unless loading from cached file."
                "To install: 'python -m spacy download en_core_web_lg'")
        try:
            text = data['summary'] if 'summary' in data else data[
                'summary:reference']
        except KeyError:
            text = data.get('highlights')
        if text:
            reference = nlp(text if args.no_clean else clean_text(text))
        else:
            reference = None
    if reference is not None:
        reference._.name = "Reference"
        reference._.column = "summary:reference"

    model_names = set()
    for k in data:
        m = re.match('(preprocessed_)?summary:(?P<model>.*)', k)
        if m:
            model_name = m.group('model')
            if model_name != 'reference':
                model_names.add(model_name)

    preds = []
    for model_name in model_names:
        try:
            pred = data[f"spacy:summary:{model_name}"]
        except KeyError:
            if not is_lg:
                st.error(
                    "'en_core_web_lg model' is required unless loading from cached file."
                    "To install: 'python -m spacy download en_core_web_lg'")
            text = data[f"summary:{model_name}"]
            pred = nlp(text if args.no_clean else clean_text(text))

        parts = model_name.split("-")
        primary_sort = 0
        if len(parts) == 2:
            model, train_dataset = parts
            if train_dataset == eval_dataset:
                formatted_model_name = model.upper()
            else:
                formatted_model_name = f"{model.upper()} ({train_dataset.upper()}-trained)"
                if train_dataset in ["xsum", "cnndm"]:
                    primary_sort = 1
                else:
                    primary_sort = 2
        else:
            formatted_model_name = model_name.upper()
        pred._.name = formatted_model_name
        pred._.column = f"summary:{model_name}"
        preds.append(((primary_sort, formatted_model_name), pred))

    preds = [pred for _, pred in sorted(preds)]

    return Instance(
        id_=id_,
        document=document,
        reference=reference,
        preds=preds,
        data=data,
    )
Code example #33
def get_skills(custom_entities, text):

    exp = get_head_sections(text)['skills']
    try:
        model_dir = config.skill_model_dir
        nlp2 = spacy.load(model_dir)
        doc2 = nlp2(exp)
        entities = utils.extract_entities(doc2)
        for key, val in entities.items():
            entities[key] = utils.clean_text(val)
        if len(entities['Skills']) > 4:
            return entities['Skills']
    except:
        print("No skills entitiy")

    skills = list()
    exp = []
    lines = [lin.strip() for lin in text.split('\n')]

    for ind, line in enumerate(lines):
        if len(line.split()) < 4 and ('skills' in line.lower()):
            try:
                for i in range(1, 5):
                    exp.append(lines[ind + i])
            except:
                exp.append(lines[ind + i])
    description = [
        e for e in exp if not e[:5].lower() == 'level' and len(e) > 0
    ]
    exp = get_head_sections(text)['skills']
    if len(exp.split('\n')) < 4:
        exp = ''
        for ind, line in enumerate(lines):
            if len(line.split()) < 8 and ('skills' in line.lower()
                                          or 'expertise' in line.lower()
                                          or 'strength' in line.lower()
                                          or 'proficiency' in line.lower()):
                try:
                    try:
                        try:
                            for i in range(1, 25):
                                exp += lines[ind + i] + ' '
                        except:
                            for i in range(1, 15):
                                exp += lines[ind + i] + ' '
                    except:
                        for i in range(1, 10):
                            exp += lines[ind + i] + ' '
                except:
                    exp += lines[ind + 1] + ' ' + lines[ind + 2]

    exp = exp.lower()
    skillset = list()
    skill_dict = {}
    nlp_text = nlp(exp)
    noun_chunks = list(nlp_text.noun_chunks)
    tokens = [token.text for token in nlp_text if not token.is_stop]
    data = pd.read_csv(config.skill_csv)

    bigrams = utils.extract_ngrams(exp, 2)

    skills = list(data.columns.values)
    custom_skillset = get_custom_skills(text)
    try:

        # check for one-grams
        for token in tokens:
            if token.lower() in skills:
                skillset.append(token)

        # check for bi-grams and tri-grams
        for token in noun_chunks:
            token = token.text.lower().strip()
            if token in skills and token not in skillset:
                skillset.append(token)
        for token in bigrams:
            if token.lower() in skills and token not in skillset:
                skillset.append(token)
        skillset = [i.lower() for i in set([i.lower() for i in skillset])]
        if len(skillset) < 5:
            for skill in custom_skillset:
                if skill not in skillset:
                    skillset.append(skill)
            skillset = skillset[:5]
        skillset = [i.capitalize() for i in set([i.lower() for i in skillset])]
        skillset = [i.strip() for i in skillset if not i in (' a ')]
        skillset = sorted(skillset, key=len, reverse=True)
        if len(skillset) < 3:
            skillset = get_mod_skills(text)
        return skillset
    except:
        if len(skillset) < 3:
            skillset = get_mod_skills(text)
        return skillset
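The get_skills example above relies on utils.extract_ngrams to compare bigrams against the skills CSV header row, but that helper is not reproduced on this page. A minimal sketch of the kind of function it appears to assume (name and behaviour are assumptions, not the project's actual code):

def extract_ngrams(text, n):
    # Sketch only: return space-joined n-grams of a whitespace-tokenised string.
    tokens = text.split()
    return [" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]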
Code example #34
0
File: bills.py Project: tamilyn/openstates
    def parse_house_bill(self, url, session):
        url = re.sub("content", "print", url)

        with self.urlopen(url) as bill_page_data:
            bill_page = BeautifulSoup(bill_page_data)
            header_table = bill_page.table

            # get all the info needed to record the bill
            bill_id = header_table.b.contents[0]
            bill_id = clean_text(bill_id)

            bill_desc = header_table.findAll('td')[1].contents[0]
            bill_desc = clean_text(bill_desc)

            lr_label_tag = bill_page.find(text=re.compile("LR Number:"))
            bill_lr = lr_label_tag.next.contents[0].strip()

            # could substitute the description for the name,
            # but keeping it separate for now.
            bill = Bill(session,
                        'lower',
                        bill_id,
                        bill_desc,
                        bill_url=url,
                        bill_lr=bill_lr)
            bill.add_source(url)

            # get the sponsors and cosponsors
            sponsor_dirty = bill_page.em.contents[0]
            m = re.search("(.*)\(.*\)", sponsor_dirty)
            if m:
                bill_sponsor = m.group(1)
            else:
                bill_sponsor = sponsor_dirty

            # find the table with bill details...it'll be useful later
            bill_details_tbl = bill_page.table.nextSibling.nextSibling

            bill_sponsor_link = None
            if bill_details_tbl.a:
                bill_sponsor_link = bill_details_tbl.a['href']

            bill.add_sponsor('primary',
                             bill_sponsor,
                             sponsor_link=bill_sponsor_link)

            # check for cosponsors
            cosponsor_cell = bill_details_tbl.find(
                text=re.compile("CoSponsor")).next

            if cosponsor_cell.a:
                self.parse_house_cosponsors(bill, cosponsor_cell)

            # parse out all the actions
            actions_link_tag = bill_page.find('a',
                                              text='ACTIONS').previous.previous

            actions_link = actions_link_tag['href']
            actions_link = re.sub("content", "print", actions_link)
            self.parse_house_actions(bill, actions_link)

            # get bill versions
            version_tags = bill_page.findAll(href=re.compile("biltxt"))
            if version_tags:
                for version_tag in version_tags:
                    if version_tag.b:
                        version = clean_text(version_tag.b.contents[0])
                        text_url = version_tag['href']
                        pdf_url = version_tag.previousSibling
                        pdf_url = pdf_url.previousSibling['href']
                        bill.add_version(version, text_url, pdf_url=pdf_url)

        self.save_bill(bill)
Code example #35
0
    mlp.train(df[fieldname].values, df[labelname].values)
    return mlp


if __name__ == '__main__':
    pos_neg_ratios = Counter()
    reviews = pd.read_csv("reviews.csv", encoding="utf-8")
    ratings = pd.read_csv("ratings.csv", encoding="utf-8")
    review_ratings = pd.merge(reviews, ratings)

    review_ratings["binary_ratings"] = review_ratings["ratings"].apply(
        lambda x: POSITIVE if x > 3 else NEGATIVE)
    review_ratings = review_ratings[
        review_ratings["unixReviewTime"] > 1403913600]

    review_ratings = clean_text(review_ratings, "summary")

    print(review_ratings.shape)
    mlp = run_network(review_ratings, "summary", "binary_ratings")
    prediction = mlp.predict("good")
    print(prediction)
    prediction = mlp.predict("Bad")
    print(prediction)

    # filter_punctuations = lambda text: "".join(list(filter(lambda x: x not in string.punctuation, str(text))))
    #
    #
    # def clean_text_field(df, fieldname):
    #     df[fieldname] = df[fieldname].str.lower()
    #     df[fieldname] = df[fieldname].apply(filter_punctuations)
    #     df.dropna(subset=[fieldname], inplace=True)
Code example #36
0
    def _parse_house_bill(self, url, session):
        # using the print page makes the page simpler, and also *drastically* smaller (8k rather than 100k)
        url = re.sub("billsummary", "billsummaryprn", url)
        url = '%s/%s' % (self._senate_base_url, url)

        bill_page = self.get(url).text
        bill_page = lxml.html.fromstring(bill_page)
        bill_page.make_links_absolute(url)

        bill_id = bill_page.xpath('//*[@class="entry-title"]')
        if len(bill_id) == 0:
            self.log("WARNING: bill summary page is blank! (%s)" % url)
            self._bad_urls.append(url)
            return
        bill_id = bill_id[0].text_content()
        bill_id = clean_text(bill_id)

        bill_desc = bill_page.xpath(
            '//*[@class="BillDescription"]')[0].text_content()
        bill_desc = clean_text(bill_desc)

        table_rows = bill_page.xpath('//table/tr')
        # if there is a cosponsor all the rows are pushed down one for the extra row for the cosponsor:
        cosponsorOffset = 0
        if table_rows[2][0].text_content().strip() == 'Co-Sponsor:':
            cosponsorOffset = 1

        lr_label_tag = table_rows[3 + cosponsorOffset]
        assert lr_label_tag[0].text_content().strip() == 'LR Number:'
        bill_lr = lr_label_tag[1].text_content()

        lastActionOffset = 0
        if table_rows[4 + cosponsorOffset][0].text_content().strip(
        ) == 'Governor Action:':
            lastActionOffset = 1
        official_title_tag = table_rows[5 + cosponsorOffset + lastActionOffset]
        assert official_title_tag[0].text_content().strip() == 'Bill String:'
        official_title = official_title_tag[1].text_content()

        # could substitute the description for the name,
        # but keeping it separate for now.

        bill_type = "bill"
        triplet = bill_id[:3]
        if triplet in bill_types:
            bill_type = bill_types[triplet]

        subs = []
        bid = bill_id.replace(" ", "")

        if bid in self._subjects:
            subs = self._subjects[bid]
            self.log("With subjects for this bill")

        self.log(bid)

        if bill_desc == "":
            print("ERROR: Blank title. Skipping. {} / {} / {}".format(
                bill_id, bill_desc, official_title))
            # XXX: Some pages full of blank bills.
            return

        bill = Bill(session,
                    'lower',
                    bill_id,
                    bill_desc,
                    bill_url=url,
                    bill_lr=bill_lr,
                    official_title=official_title,
                    type=bill_type,
                    subjects=subs)
        bill.add_source(url)

        bill_sponsor = clean_text(table_rows[0][1].text_content())
        try:
            bill_sponsor_link = table_rows[0][1][0].attrib['href']
        except IndexError:
            return

        if bill_sponsor_link:
            bill_sponsor_link = '%s%s' % (self._senate_base_url,
                                          bill_sponsor_link)

        bill.add_sponsor('primary',
                         bill_sponsor,
                         sponsor_link=bill_sponsor_link)

        # check for cosponsors
        if cosponsorOffset == 1:
            if len(table_rows[2][1]) == 1:  # just a name
                cosponsor = table_rows[2][1][0]
                bill.add_sponsor(
                    'cosponsor',
                    cosponsor.text_content(),
                    sponsor_link='%s/%s' %
                    (self._senate_base_url, cosponsor.attrib['href']))
            else:  # name ... etal
                try:
                    cosponsor = table_rows[2][1][0]
                    bill.add_sponsor(
                        'cosponsor',
                        clean_text(cosponsor.text_content()),
                        sponsor_link='%s/%s' %
                        (self._senate_base_url, cosponsor.attrib['href']))
                    sponsors_url, = bill_page.xpath(
                        "//a[contains(@href, 'CoSponsors.aspx')]/@href")
                    self._parse_cosponsors_from_bill(bill, sponsors_url)
                except scrapelib.HTTPError as e:
                    self.log("WARNING: " + str(e))
                    self._bad_urls.append(url)
                    self.log("WARNING: no bill summary page (%s)" % url)

        # actions_link_tag = bill_page.xpath('//div[@class="Sections"]/a')[0]
        # actions_link = '%s/%s' % (self._senate_base_url,actions_link_tag.attrib['href'])
        # actions_link = re.sub("content", "print", actions_link)

        actions_link, = bill_page.xpath(
            "//a[contains(@href, 'BillActions.aspx')]/@href")
        self._parse_house_actions(bill, actions_link)

        # get bill versions
        doc_tags = bill_page.xpath('//div[@class="BillDocsSection"][1]/span')
        for doc_tag in reversed(doc_tags):
            doc = clean_text(doc_tag.text_content())
            text_url = '%s%s' % (self._senate_base_url,
                                 doc_tag[0].attrib['href'])
            bill.add_document(doc, text_url, mimetype="text/html")

        # get bill versions
        version_tags = bill_page.xpath(
            '//div[@class="BillDocsSection"][2]/span')
        for version_tag in reversed(version_tags):
            version = clean_text(version_tag.text_content())
            for vurl in version_tag.xpath(".//a"):
                if vurl.text == 'PDF':
                    mimetype = 'application/pdf'
                else:
                    mimetype = 'text/html'
                bill.add_version(version,
                                 vurl.attrib['href'],
                                 on_duplicate='use_new',
                                 mimetype=mimetype)
        self.save_bill(bill)
Code example #37
0
| Repository   | Description       |
|:-------------|:------------------|""" % (lang, lang, lang_git)

print(header)

print(header_trendy)
fname = "data/%s-github-trendy.json" % lang
with open(fname) as f:
    for line in f:
        package = json.loads(line)
        if "description" not in package or "name" not in package or "url" not in package or not package[
                "name"] or not package["url"]:
            continue
        if package["description"] is None:
            package["description"] = ""
        row = "| [%s](%s) | %s |" % (clean_text(
            package["name"]), package["url"], clean_text(
                package["description"]))
        print(row)

print(header_packages)
fname = "data/%s.json" % lang
with open(fname) as f:
    for line in f:
        package = json.loads(line)
        if "description" not in package or "name" not in package or "url" not in package or not package[
                "name"] or not package["url"]:
            continue
        if package["description"] is None:
            package["description"] = ""
        row = "| [%s](%s) | %s |" % (clean_text(
            package["name"]), package["url"], clean_text(
Code example #38
0
def re_clean(df):
    text = df['comment_clean'].tolist()
    cleaned_text = clean_text(text)
    df.loc[:,'reviews'] = cleaned_text
    return df
Code example #39
0
def cleaning(tweet):
    return sentence_spell_check(cont.fix(clean_text(tweet)))
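Examples #38 and #39 both delegate to a project-specific clean_text: #38's version works on a whole list of comments, #39's on a single tweet. As a rough sketch of the single-string case only, this is the sort of normaliser such one-liners typically assume (assumed behaviour, not either repository's actual implementation):

import re
import string

def clean_text(text):
    # Sketch only: lower-case, drop URLs, strip punctuation, collapse whitespace.
    text = str(text).lower()
    text = re.sub(r"https?://\S+", " ", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    return re.sub(r"\s+", " ", text).strip()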
Code example #40
0
File: bills.py Project: PamelaM/openstates
    def parse_house_bill(self, url, session):
        # using the print page makes the page simpler, and also *drastically* smaller (8k rather than 100k)
        url = re.sub("billsummary", "billsummaryprn", url)
        url = '%s/%s' % (self.senate_base_url,url)

        with self.urlopen(url) as bill_page:
            bill_page = lxml.html.fromstring(bill_page)

            bill_id = bill_page.xpath('//*[@class="entry-title"]')
            if len(bill_id) == 0:
                print "WARNING: bill summary page is blank! (%s)" % url
                self.bad_urls.append(url)
                return
            bill_id = bill_id[0].text_content()
            bill_id = clean_text(bill_id)

            bill_desc = bill_page.xpath('//*[@class="BillDescription"]')[0].text_content()
            bill_desc = clean_text(bill_desc)

            table_rows = bill_page.xpath('//table/tr')
            # if there is a cosponsor all the rows are pushed down one for the extra row for the cosponsor:
            cosponsorOffset = 0
            if table_rows[2][0].text_content().strip() == 'Co-Sponsor:':
                cosponsorOffset = 1
                
            lr_label_tag = table_rows[3+cosponsorOffset]
            assert lr_label_tag[0].text_content().strip() == 'LR Number:'
            bill_lr = lr_label_tag[1].text_content()

            lastActionOffset = 0
            if table_rows[4+cosponsorOffset][0].text_content().strip() == 'Governor Action:':
                lastActionOffset = 1
            official_title_tag = table_rows[5+cosponsorOffset+lastActionOffset]
            assert official_title_tag[0].text_content().strip() == 'Bill String:'
            official_title = official_title_tag[1].text_content()

            # could substitute the description for the name,
            # but keeping it separate for now.
            bill = Bill(session, 'lower', bill_id, bill_desc, bill_url=url, bill_lr=bill_lr, official_title=official_title)
            bill.add_source(url)

            bill_sponsor = clean_text(table_rows[0][1].text_content())
            bill_sponsor_link = table_rows[0][1][0].attrib['href']
            if bill_sponsor_link:
                bill_sponsor_link = '%s%s' % (self.senate_base_url,bill_sponsor_link)

            bill.add_sponsor('primary', bill_sponsor, sponsor_link=bill_sponsor_link)

            # check for cosponsors
            if cosponsorOffset == 1:
                if len(table_rows[2][1]) == 1: # just a name
                    cosponsor = table_rows[2][1][0]
                    bill.add_sponsor('cosponsor', cosponsor.text_content(), sponsor_link='%s/%s' % (self.senate_base_url,cosponsor.attrib['href']))
                else: # name ... etal
                    try:
                        cosponsor = table_rows[2][1][0]
                        bill.add_sponsor('cosponsor', clean_text(cosponsor.text_content()), sponsor_link='%s/%s' % (self.senate_base_url,cosponsor.attrib['href']))
                        self.parse_cosponsors_from_bill(bill,'%s/%s' % (self.senate_base_url,table_rows[2][1][1].attrib['href']))
                    except scrapelib.HTTPError:
                        self.bad_urls.append(url)
                        print "WARNING: no bill summary page (%s)" % url

            actions_link_tag = bill_page.xpath('//div[@class="Sections"]/a')[0]
            actions_link = '%s/%s' % (self.senate_base_url,actions_link_tag.attrib['href'])
            actions_link = re.sub("content", "print", actions_link)
            self.parse_house_actions(bill, actions_link)

            # get bill versions
            version_tags = bill_page.xpath('//div[@class="BillDocsSection"][2]/span')
            for version_tag in reversed(version_tags):
                version = clean_text(version_tag.text_content())
                text_url = '%s%s' % (self.senate_base_url,version_tag[0].attrib['href'])
                pdf_url = '%s%s' % (self.senate_base_url,version_tag[1].attrib['href'])
                bill.add_version(version, text_url, pdf_url=pdf_url)

        self.save_bill(bill)
Code example #41
0
    def __clean_text(self, text):
        return utils.clean_text(text)
Code example #42
0
def main(argv):
    IMAGE_DIRECTORY = '/images'
    CSV_FILE_PATH = 'data.csv'
    num_epochs = 250000
    BATCH_SIZE = 1
    img_shape = (224, 224, 3) # Reduce based on RAM
    GRAPH_THRESHOLD = 0.5
    LEARNING_RATE = 1.6192e-05
    
    LOSS_WEIGHTS = [0.6, 0.2, 0.2] #Give importance to classification, semantic and gap loss respectively.
    IMAGE_ENCODER = 'resnet50'
    TEXT_ENCODER = 'bert'

    

    try:
        opts, args = getopt.getopt(argv,"i:t:b:",["image_encoder=","text_encoder=","batch_size="])
    except getopt.GetoptError:
        print('test -i <image_folder> -c <csv_filename>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('test.py -i <inputfile> -o <outputfile>')
            sys.exit()
        elif opt in ("-i", "--image_encoder"):
            IMAGE_ENCODER = arg
        elif opt in ("-c", "--text_encoder"):
            TEXT_ENCODER = arg
        elif opt in ("-b", "--batch_size"):
            BATCH_SIZE = int(arg)
            print("Set batch_size to %d" %BATCH_SIZE)
        
    
    
    df = pd.read_csv(CSV_FILE_PATH)
    num_samples = df.shape[0]

    class_names = df.classes.unique()

    ## CONVERT TO CATEGORICAL
    temp = list(df.classes)
    training_class_intmap = temp.copy()

    ### map each color to an integer
    mapping = {}

    for x in range(len(class_names)):
        mapping[class_names[x]] = x
    
    # integer representation
    for x in range(df.shape[0]):
        training_class_intmap[x] = mapping[training_class_intmap[x]]

    training_classes = tf.keras.utils.to_categorical(training_class_intmap)
    image_names = df.image
    text_list = df.text

    text_list = utils.clean_text(text_list)
    num_classes = len(class_names)

    adj_graph_classes = utils.get_adj_graph(class_names)

    if (IMAGE_ENCODER=='resnet50'):
        image_embedding_extractor_model = encoder.get_resnet50(img_shape)
        image_encoder_size = 2048
    elif (IMAGE_ENCODER=='resnet101'):
        image_embedding_extractor_model = encoder.get_resnet101(img_shape)
    if (TEXT_ENCODER=='bert'):
        bert_embedding = BertEmbedding()
        text_encoder_size = 768
    
    complete_model = build_model(image_encoder_size , text_encoder_size, num_classes)

    train_loss_results = []
    # train_accuracy_results = []


    optimizer = tf.keras.optimizers.RMSprop(learning_rate=LEARNING_RATE) #Define the optimize and specify the learning rate

    


    for epoch in range(num_epochs):
        epoch_loss_avg = tf.keras.metrics.Mean()
        # epoch_accuracy = tf.keras.metrics.CategoricalAccuracy() #Uncomment if you want to track
        # Training loop - using batches of 1024
        # encode_and_pack_batch(batch_size, image_encoder, text_encoder, image_names, text_list, training_classes, img_shape):
        xi1 , xt1,xi2 , xt2, y1, y2  = utils.encode_and_pack_batch(BATCH_SIZE, image_embedding_extractor_model,  bert_embedding ,image_names, text_list, training_classes, img_shape)
        x1 = [xi1, xt1]
        x2 = [xi2, xt2]
        # Optimize the model
        loss_value, grads = grad(complete_model, x1,x2,y1, y2, LOSS_WEIGHTS,GRAPH_THRESHOLD, adj_graph_classes)
        optimizer.apply_gradients(zip(grads, complete_model.trainable_variables))

        # Track progress
        epoch_loss_avg.update_state(loss_value)  # Add current batch loss
        

        # End epoch
        train_loss_results.append(epoch_loss_avg.result())

        if epoch % 5 == 0:
            print("Epoch {:03d}: Loss: {:.3f}".format(epoch,epoch_loss_avg.result()))
Code example #43
0
def predict(config):
    input = clean_text(config.input)
    params = Params('config/params.json')

    # load tokenizer and torchtext Fields
    pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb')
    cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    pickle_kor = open('pickles/kor.pickle', 'rb')
    kor = pickle.load(pickle_kor)
    pickle_eng = open('pickles/eng.pickle', 'rb')
    eng = pickle.load(pickle_eng)
    eos_idx = eng.vocab.stoi['<eos>']
    print(eos_idx)
    print(eng.vocab.itos[eos_idx])

    # select model and load trained model
    model = Transformer(params)
    model.load_state_dict(torch.load(params.save_model))
    model.to(params.device)
    model.eval()

    # convert input into tensor and forward it through selected model
    tokenized = tokenizer.tokenize(input)
    indexed = [kor.vocab.stoi[token] for token in tokenized]

    source = torch.LongTensor(indexed).unsqueeze(0).to(params.device)  # [1, source_len]: unsqueeze to add batch size
    target = torch.zeros(1, params.max_len).type_as(source.data)       # [1, max_len]

    encoder_output = model.encoder(source)
    next_symbol = eng.vocab.stoi['<sos>']

    for i in range(0, params.max_len):
        if next_symbol == eos_idx:
            break
        target[0][i] = next_symbol
        print(target[0][i])
        decoder_output, _ = model.decoder(target, source, encoder_output)  # [1, target length, output dim]
        prob = decoder_output.squeeze(0).max(dim=-1, keepdim=False)[1]
        print(prob)
        next_word = prob.data[i]
        print(next_word)
        next_symbol = next_word.item()

    
    target[0][10] = 3
    print(target.shape)
    print(target[0][34])
    eos_idx = torch.where(target[0] == eos_idx)[0][0]
    eos_idx = eos_idx.item()
    print(eos_idx)
    target = target[0][:eos_idx].unsqueeze(0)

    # translation_tensor = [target length] filed with word indices
    target, attention_map = model(source, target)
    target = target.squeeze(0).max(dim=-1)[1]

    translated_token = [eng.vocab.itos[token] for token in target]
    print(translated_token)
    #translation = translated_token[:translated_token.index('<eos>')]
    #translation = ''.join(translation)
    translation = ''.join(translated_token)

    print(f'question> {config.input}')
    print(f'reply> {translation}')
    display_attention(tokenized, translated_token, attention_map[4].squeeze(0)[:-1])
Code example #44
0
File: dataload.py Project: yolearn/tweet-sentiment
            'ids' : torch.tensor(ids, dtype=torch.long),
            'mask_ids' : torch.tensor(mask_ids, dtype=torch.long),
            'token_type_ids' : torch.tensor(token_type_ids, dtype=torch.long),
            'target_start_idx' : torch.tensor(target_start_idx, dtype=torch.long),
            'target_end_idx' : torch.tensor(target_end_idx, dtype=torch.long),
            'offsets' : torch.tensor(offsets, dtype=torch.long),
            'orig_sentiment' : sentiment,
            'orig_sele_text' : selected_text,
            'orig_text' : text,
            'targ_sentiment' : torch.tensor(targ_sentiment, dtype=torch.long)
        }


if __name__ == "__main__":
    trn_df = pd.read_csv(config.TRAIN_FILE)
    trn_df['text'] = trn_df['text'].apply(lambda x:clean_text(x))
    trn_df['selected_text'] = trn_df['selected_text'].apply(lambda x:clean_text(x))

    dataset = TweetDataset(trn_df['text'].values, 
                    trn_df['selected_text'].values, 
                    trn_df['sentiment'].values, 
                    config.TOKENIZER, 
                    config.MAX_LEN,
                    config.MODEL_VERSION
                )            
    
    for i in range(len(trn_df)):
        try:
            dataset[i]
        except:
            print(i)
Code example #45
0
def get_data_features(max_seq_len, embedding_file, batch_size):
    """
    Args:
        max_seq_len: Max sequence length of the sentences
        embedding_file: Embedding file
        batch_size: Batch size for the DataLoader

    Output:
        embedding_dim, word_index, embedding_matrix, X_train, y_train, X_test, y_test
    """

    #Load data
    train, val, test, features_train, features_val, features_test = load_data_features(
    )

    #Embedding dimension based on the embedding_file
    embedding_dim = int(re.findall('\d{3,}', embedding_file)[0])

    #Clean data
    X_train = [
        clean_text(text,
                   remove_punt_number_special_chars=True,
                   remove_stopwords=True,
                   apply_stemming=False) for text in train["text"]
    ]
    X_val = [
        clean_text(text,
                   remove_punt_number_special_chars=True,
                   remove_stopwords=True,
                   apply_stemming=False) for text in val["text"]
    ]
    X_test = [
        clean_text(text,
                   remove_punt_number_special_chars=True,
                   remove_stopwords=True,
                   apply_stemming=False) for text in test["text"]
    ]

    y_train = encode_label(train["label"])
    y_val = encode_label(val["label"])
    y_test = encode_label(test["label"])

    tokenizer = Tokenizer(num_words=10000000)
    tokenizer.fit_on_texts(list(X_train) + list(X_val))

    word_index = tokenizer.word_index
    vocab_size = len(word_index) + 1

    #Embeddings
    embeddings_index = load_glove(embedding_file)
    embedding_matrix = create_weight_matrix(vocab_size, word_index,
                                            embedding_dim, embeddings_index)

    X_train = tokenizer.texts_to_sequences(X_train)
    X_train = pad_sequences(X_train, maxlen=max_seq_len)

    X_val = tokenizer.texts_to_sequences(X_val)
    X_val = pad_sequences(X_val, maxlen=max_seq_len)

    X_test = tokenizer.texts_to_sequences(X_test)
    X_test = pad_sequences(X_test, maxlen=max_seq_len)

    train_dataloader = get_dataloader_features(X_train, features_train,
                                               y_train, batch_size)
    val_dataloader = get_dataloader_features(X_val, features_val, y_val,
                                             batch_size)
    test_dataloader = get_dataloader_features(X_test, features_test, y_test,
                                              batch_size)

    return embedding_dim, int(
        vocab_size
    ), embedding_matrix, train_dataloader, val_dataloader, test_dataloader
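get_data_features above hands the GloVe vectors to create_weight_matrix to build an embedding matrix aligned with the tokenizer's word_index; that helper is defined elsewhere in the project. A plausible sketch under the same signature (an assumption, not the actual implementation):

import numpy as np

def create_weight_matrix(vocab_size, word_index, embedding_dim, embeddings_index):
    # Sketch only: zero-initialise, then copy in the pre-trained vector
    # for every word that has one.
    matrix = np.zeros((vocab_size, embedding_dim))
    for word, idx in word_index.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            matrix[idx] = vector
    return matrix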
Code example #46
0
directory = '../scraper/'
data_files = []
json_file_item = None
with open('../artifacts/anon_dict.json') as json_file:
    json_file_item = json.load(json_file)

analysis = ""
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith('.csv') & (True == filename.startswith('results_')):
        data_files.append(pd.read_csv(directory+filename))
        analysis = filename.split('.csv')[0]

df = pd.concat(data_files, sort=False)

df['clean_text'] = df['text'].map(lambda x: clean_text(x))

df['date'] = df['timestamp'].apply(lambda x: transform_date(x))
df['year'] = df['date'].apply(lambda x: x.year)
df = df.loc[df['year'] >= df['year'].max(), ]

df['hashtags'] = df['text'].map(lambda x: get_hashtags_operations(x))

terms_attacks = json_file_item["attacks"]

df['attack'] = df['clean_text'].map(lambda x: check_attack(x, terms_attacks))
df['operations'] = df['hashtags'].map(lambda x: True if len(
    [hashtag for hashtag in x if '#op' == hashtag[:3]]) > 0 else False)
df['RT'] = df['clean_text'].map(lambda x: True if 'rt' in x else False)

# Translate RTs to Attacks
Code example #47
0
File: bills.py Project: Hasimir/sunlight-openstates
    def _parse_house_bill(self, url, session):
        # using the print page makes the page simpler, and also *drastically* smaller (8k rather than 100k)
        url = re.sub("billsummary", "billsummaryprn", url)
        url = '%s/%s' % (self._senate_base_url,url)

        bill_page = self.get(url).text
        bill_page = lxml.html.fromstring(bill_page)
        bill_page.make_links_absolute(url)

        bill_id = bill_page.xpath('//*[@class="entry-title"]')
        if len(bill_id) == 0:
            self.log("WARNING: bill summary page is blank! (%s)" % url)
            self._bad_urls.append(url)
            return
        bill_id = bill_id[0].text_content()
        bill_id = clean_text(bill_id)

        bill_desc = bill_page.xpath('//*[@class="BillDescription"]')[0].text_content()
        bill_desc = clean_text(bill_desc)

        table_rows = bill_page.xpath('//table/tr')
        # if there is a cosponsor all the rows are pushed down one for the extra row for the cosponsor:
        cosponsorOffset = 0
        if table_rows[2][0].text_content().strip() == 'Co-Sponsor:':
            cosponsorOffset = 1

        lr_label_tag = table_rows[3+cosponsorOffset]
        assert lr_label_tag[0].text_content().strip() == 'LR Number:'
        bill_lr = lr_label_tag[1].text_content()

        lastActionOffset = 0
        if table_rows[4+cosponsorOffset][0].text_content().strip() == 'Governor Action:':
            lastActionOffset = 1
        official_title_tag = table_rows[5+cosponsorOffset+lastActionOffset]
        assert official_title_tag[0].text_content().strip() == 'Bill String:'
        official_title = official_title_tag[1].text_content()

        # could substitute the description for the name,
        # but keeping it separate for now.

        bill_type = "bill"
        triplet = bill_id[:3]
        if triplet in bill_types:
            bill_type = bill_types[triplet]

        subs = []
        bid = bill_id.replace(" ", "")

        if bid in self._subjects:
            subs = self._subjects[bid]
            self.log("With subjects for this bill")

        self.log(bid)

        if bill_desc == "":
            print("ERROR: Blank title. Skipping. {} / {} / {}".format(
                bill_id, bill_desc, official_title
            ))
            # XXX: Some pages full of blank bills.
            return

        bill = Bill(session, 'lower', bill_id, bill_desc, bill_url=url,
                    bill_lr=bill_lr, official_title=official_title,
                    type=bill_type, subjects=subs)
        bill.add_source(url)

        bill_sponsor = clean_text(table_rows[0][1].text_content())
        try:
            bill_sponsor_link = table_rows[0][1][0].attrib['href']
        except IndexError:
            return

        if bill_sponsor_link:
            bill_sponsor_link = '%s%s' % (self._senate_base_url,bill_sponsor_link)

        bill.add_sponsor('primary', bill_sponsor, sponsor_link=bill_sponsor_link)

        # check for cosponsors
        if cosponsorOffset == 1:
            if len(table_rows[2][1]) == 1: # just a name
                cosponsor = table_rows[2][1][0]
                bill.add_sponsor('cosponsor', cosponsor.text_content(),
                                 sponsor_link='%s/%s' % (
                                     self._senate_base_url,
                                     cosponsor.attrib['href']
                                ))
            else: # name ... etal
                try:
                    cosponsor = table_rows[2][1][0]
                    bill.add_sponsor('cosponsor',
                                     clean_text(cosponsor.text_content()),
                                     sponsor_link='%s/%s' % (
                                         self._senate_base_url,
                                         cosponsor.attrib['href']
                                     ))
                    sponsors_url, = bill_page.xpath(
                        "//a[contains(@href, 'CoSponsors.aspx')]/@href")
                    self._parse_cosponsors_from_bill(bill, sponsors_url)
                except scrapelib.HTTPError as e:
                    self.log("WARNING: " + str(e))
                    self._bad_urls.append(url)
                    self.log( "WARNING: no bill summary page (%s)" % url )

        # actions_link_tag = bill_page.xpath('//div[@class="Sections"]/a')[0]
        # actions_link = '%s/%s' % (self._senate_base_url,actions_link_tag.attrib['href'])
        # actions_link = re.sub("content", "print", actions_link)

        actions_link, = bill_page.xpath(
            "//a[contains(@href, 'BillActions.aspx')]/@href")
        self._parse_house_actions(bill, actions_link)

        # get bill versions
        doc_tags = bill_page.xpath('//div[@class="BillDocsSection"][1]/span')
        for doc_tag in reversed(doc_tags):
            doc = clean_text(doc_tag.text_content())
            text_url = '%s%s' % (
                self._senate_base_url,
                doc_tag[0].attrib['href']
            )
            bill.add_document(doc, text_url,
                              mimetype="text/html")

        # get bill versions
        version_tags = bill_page.xpath('//div[@class="BillDocsSection"][2]/span')
        for version_tag in reversed(version_tags):
            version = clean_text(version_tag.text_content())
            for vurl in version_tag.xpath(".//a"):
                if vurl.text == 'PDF':
                    mimetype = 'application/pdf'
                else:
                    mimetype = 'text/html'
                bill.add_version(version, vurl.attrib['href'],
                                 on_duplicate='use_new', mimetype=mimetype)
        self.save_bill(bill)
Code example #48
0
File: test.py Project: Chitti-007/New_Mood-Emotion
    audio = r.listen(source)

print("Loading vocab2int")
vocab2int = pickle.load(open("Mood:Emotion Code/data/vocab2int.pickle", "rb"))

model = get_model_emotions(len(vocab2int),
                           sequence_length=sequence_length,
                           embedding_size=embedding_size)
model.load_weights("results/model_v1_0.59_0.76.h5")

if __name__ == "__main__":
    import argparse
    # parser = argparse.ArgumentParser(description="Emotion classifier using text")
    # parser.add_argument("text", type=str, help="The text you want to analyze")

    # args = parser.parse_args()

    text = tokenize_words(clean_text(r.recognize_google(audio)), vocab2int)
    x = pad_sequences([text], maxlen=sequence_length)
    prediction = model.predict_classes(x)[0]

    probs = model.predict(x)[0]
    # print("hi:",index)
    print("Question asked: ", Textlist[index])
    print("You said: " + r.recognize_google(audio))
    print("Probs:")
    for i, category in categories.items():
        print(f"{category.capitalize()}: {probs[i]*100:.2f}%")

    print("The most dominant emotion:", categories[prediction])
Code example #49
0
vocab_size = 2000
batch_size = 32
lr = 0.001
epochs = 500

n_chars = 500  # to generate
temperature = 0.6

text_file = 'TheHitchhikersGuide.txt'

# spm.SentencePieceTrainer.Train(f'--input={text_file} --model_prefix=tokens --vocab_size={vocab_size}')
sp = spm.SentencePieceProcessor()
sp.Load("tokens.model")

text = open(text_file, 'rb').read().decode(encoding='utf-8')
text = clean_text(text)
text_as_int = np.array(sp.EncodeAsIds(text))

model = TransformerCharLM(vocab=vocab_size,
                          d_model=384,
                          n_heads=6,
                          n_encoder_layers=10,
                          d_ff=2048,
                          dropout=0.1)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

try:
    for epoch in range(1, epochs + 1):
Code example #50
0
def abs_summ_api():
    if not request.json:
        abort(400)

    transcript  = request.get_json()['entries']
    id          = request.args.get('id', default=None, type=str)
    callbackurl = request.args.get('callbackurl', default=None, type=str)
    enc         = request.args.get('enc', default='utf-8', type=str)
    size        = request.args.get('nkeys', default=100, type=int)
    lang        = request.args.get('lang', default='fr', type=str)

    if id is None:
        abort(400)

    try:
        speakers = []
        utterances = []
        for item in transcript:
            speakers.append(item['speaker'])
            utterances.append(utils.clean_utterance(item['text'], resources[lang]['filler_words']))

        utterances_tagged = [
            ' '.join(['/'.join(t) for t in sent])
            for sent in resources[lang]['pos_tagger'].tag_sents([u.split() for u in utterances])
        ]

        data = zip(range(len(utterances)), speakers, utterances_tagged)
        communities = detection(data, resources[lang]['stopwords'], config)
        compressions, graphs = compression(communities, resources[lang]['stopwords'], resources[lang]['word_vectors'], resources[lang]['language_model'], config, lang)
        summary = selection(compressions, utterances, resources[lang]['stopwords'], resources[lang]['word_vectors'], config, size)

        # get CoreRank scores dict
        lists_of_terms = []
        for sentence in utterances:
            lists_of_terms.append(
                utils.clean_text(
                    copy.copy(sentence),
                    stopwords=resources[lang]['stopwords'],
                    remove_stopwords=config.getboolean('KEYWORDS', 'remove_stopwords'),
                    pos_filtering=config.getboolean('KEYWORDS', 'pos_filtering'),
                    stemming=config.getboolean('KEYWORDS', 'stemming'),
                    lower_case=True
                    # lower_case for CoreRank
                )
            )
        keywords = cr.get_core_rank_scores(
            lists_of_terms,
            window_size=config.getint('KEYWORDS', 'w'),
            overspanning=config.getboolean('KEYWORDS', 'overspanning'),
            weighted=config.getboolean('KEYWORDS', 'weighted')
        )

        if callbackurl is None:
            return jsonify({'summary': summary, 'keywords': keywords})
        else:
            r = requests.post(callbackurl, json={'summary': summary, 'keywords': keywords})
            if r.status_code == requests.codes.ok:
                return "summary produced succesfully for meeting " + id
            else:
                raise RuntimeError()
    except Exception as e:
        print e
        return "got exception trying to run process"
Code example #51
0
File: train.py Project: chellee886/qiqc

if __name__ == "__main__":
    config = Config("./config.json")

    data = Data(config)

    print("Getting training and testing data...")
    training_data = data.get_data(data_set_name="train.csv",
                                  is_train_data=True)
    testing_data = data.get_data(data_set_name="test.csv", is_train_data=False)
    # print(training_data["question_text"])

    print("Cleaning data...")
    training_data["question_text"] = training_data["question_text"].apply(
        lambda x: clean_text(x, MISPELL_DICT))
    testing_data["question_text"] = testing_data["question_text"].apply(
        lambda x: clean_text(x, MISPELL_DICT))
    # print(training_data["question_text"])

    print("Getting word embedding...")
    word2idx = data.word_to_idx(training_data)
    # print("before : {}".format(len(word2idx)))

    # print(len(word2idx))
    emb_dict = data.get_embedding_dict(embedding_name="newglove.840B.300d.txt",
                                       reset_embedding_table=False,
                                       word_set=word2idx.keys())
    # print("After : {}".format(len(emb_dict)))

    emb_table = data.get_embedding_table(word2idx=word2idx,
Code example #52
0
    def parse_house_bill(self, url, session):
        # using the print page makes the page simpler, and also *drastically* smaller (8k rather than 100k)
        url = re.sub("billsummary", "billsummaryprn", url)
        url = '%s/%s' % (self.senate_base_url, url)

        with self.urlopen(url) as bill_page:
            bill_page = lxml.html.fromstring(bill_page)

            bill_id = bill_page.xpath('//*[@class="entry-title"]')
            if len(bill_id) == 0:
                print "WARNING: bill summary page is blank! (%s)" % url
                self.bad_urls.append(url)
                return
            bill_id = bill_id[0].text_content()
            bill_id = clean_text(bill_id)

            bill_desc = bill_page.xpath(
                '//*[@class="BillDescription"]')[0].text_content()
            bill_desc = clean_text(bill_desc)

            table_rows = bill_page.xpath('//table/tr')
            # if there is a cosponsor all the rows are pushed down one for the extra row for the cosponsor:
            cosponsorOffset = 0
            if table_rows[2][0].text_content().strip() == 'Co-Sponsor:':
                cosponsorOffset = 1

            lr_label_tag = table_rows[3 + cosponsorOffset]
            assert lr_label_tag[0].text_content().strip() == 'LR Number:'
            bill_lr = lr_label_tag[1].text_content()

            lastActionOffset = 0
            if table_rows[4 + cosponsorOffset][0].text_content().strip(
            ) == 'Governor Action:':
                lastActionOffset = 1
            official_title_tag = table_rows[5 + cosponsorOffset +
                                            lastActionOffset]
            assert official_title_tag[0].text_content().strip(
            ) == 'Bill String:'
            official_title = official_title_tag[1].text_content()

            # could substitute the description for the name,
            # but keeping it separate for now.
            bill = Bill(session,
                        'lower',
                        bill_id,
                        bill_desc,
                        bill_url=url,
                        bill_lr=bill_lr,
                        official_title=official_title)
            bill.add_source(url)

            bill_sponsor = clean_text(table_rows[0][1].text_content())
            bill_sponsor_link = table_rows[0][1][0].attrib['href']
            if bill_sponsor_link:
                bill_sponsor_link = '%s%s' % (self.senate_base_url,
                                              bill_sponsor_link)

            bill.add_sponsor('primary',
                             bill_sponsor,
                             sponsor_link=bill_sponsor_link)

            # check for cosponsors
            if cosponsorOffset == 1:
                if len(table_rows[2][1]) == 1:  # just a name
                    cosponsor = table_rows[2][1][0]
                    bill.add_sponsor(
                        'cosponsor',
                        cosponsor.text_content(),
                        sponsor_link='%s/%s' %
                        (self.senate_base_url, cosponsor.attrib['href']))
                else:  # name ... etal
                    try:
                        cosponsor = table_rows[2][1][0]
                        bill.add_sponsor(
                            'cosponsor',
                            clean_text(cosponsor.text_content()),
                            sponsor_link='%s/%s' %
                            (self.senate_base_url, cosponsor.attrib['href']))
                        self.parse_cosponsors_from_bill(
                            bill,
                            '%s/%s' % (self.senate_base_url,
                                       table_rows[2][1][1].attrib['href']))
                    except scrapelib.HTTPError:
                        self.bad_urls.append(url)
                        print "WARNING: no bill summary page (%s)" % url

            actions_link_tag = bill_page.xpath('//div[@class="Sections"]/a')[0]
            actions_link = '%s/%s' % (self.senate_base_url,
                                      actions_link_tag.attrib['href'])
            actions_link = re.sub("content", "print", actions_link)
            self.parse_house_actions(bill, actions_link)

            # get bill versions
            version_tags = bill_page.xpath(
                '//div[@class="BillDocsSection"][2]/span')
            for version_tag in reversed(version_tags):
                version = clean_text(version_tag.text_content())
                text_url = '%s%s' % (self.senate_base_url,
                                     version_tag[0].attrib['href'])
                pdf_url = '%s%s' % (self.senate_base_url,
                                    version_tag[1].attrib['href'])
                bill.add_version(version, text_url, pdf_url=pdf_url)

        self.save_bill(bill)
Code example #53
0
    with open(
            f"mlruns/{args.EXPERIMENT_ID}/{args.RUN_ID}/artifacts/files/x_char_encoder",
            "rb") as infile:
        x_char_encoder = dill.load(infile)

    with open(
            f"mlruns/{args.EXPERIMENT_ID}/{args.RUN_ID}/artifacts/files/y_ner_encoder",
            "rb") as infile:
        y_ner_encoder = dill.load(infile)

    with open(
            f"mlruns/{args.EXPERIMENT_ID}/{args.RUN_ID}/artifacts/files/tag_to_index",
            "rb") as infile:
        tag_to_index = dill.load(infile)

    X_text = clean_text(args.DATA_TEXT)
    X_text_list_as_is = [X_text.split(' ')]
    X_text_list = [[word.lower() for word in lst] for lst in X_text_list_as_is]

    X_tags, tag_to_index_infer = get_POS_tags(X_text_list)
    X_text_list = trim_list_of_lists_upto_max_len(X_text_list,
                                                  max_sentence_len)
    X_text_list_as_is = trim_list_of_lists_upto_max_len(
        X_text_list_as_is, max_sentence_len)
    X_tags = trim_list_of_lists_upto_max_len(X_tags, max_sentence_len)

    alnum, numeric, alpha, digit, lower, title, ascii = enrich_data(
        X_text_list_as_is)

    alnum = pad_and_stack_list_of_list(
        alnum,
Code example #54
0
                            'CPU': 1,
                            'GPU': 0
                        })

from model import get_model_5stars
from utils import clean_text, tokenize_words
from config import embedding_size, sequence_length
from keras.preprocessing.sequence import pad_sequences

import pickle

vocab2int = pickle.load(open("data/vocab2int.pickle", "rb"))
model = get_model_5stars(len(vocab2int),
                         sequence_length=sequence_length,
                         embedding_size=embedding_size)

model.load_weights("results/model_V20_0.38_0.80.h5")

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Food Review evaluator")
    parser.add_argument("review",
                        type=str,
                        help="The review of the product in text")
    args = parser.parse_args()

    review = tokenize_words(clean_text(args.review), vocab2int)
    x = pad_sequences([review], maxlen=sequence_length)

    print(f"{model.predict(x)[0][0]:.2f}/5")