def save_data(data):
    """Save the scraped data by writing it to a .csv file."""
    flatten = lambda x: list(chain.from_iterable(x))
    df = pd.DataFrame(flatten(data))
    df = process(df)
    df.to_csv('./data/fifa21.csv', index=False)
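# A minimal usage sketch for save_data (illustrative only): the scraper is
# assumed to yield one list of row dicts per page; save_data flattens the
# pages into a single DataFrame, cleans it via process() and writes
# ./data/fifa21.csv. The player names and ratings below are placeholders.
scraped_pages = [
    [{"name": "L. Messi", "overall": 93}, {"name": "R. Lewandowski", "overall": 91}],
    [{"name": "C. Ronaldo", "overall": 92}],
]
save_data(scraped_pages)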
def sample(checkpoint, length, lstm_size, start=""):
    """
    Generate new text.
    checkpoint: parameter file from a given training iteration
    length: number of characters to generate
    lstm_size: number of hidden units
    start: seed text
    """
    data, word2int, int2word, vocab = process(FLAGS.file_path)
    with open("./output/w2i.txt", "w") as f:
        f.write(str(word2int))
    with open("./output/i2w.txt", "w") as f:
        f.write(str(int2word))
    with open("./output/vocab.txt", "w") as f:
        f.write(str(vocab))

    # make sure the seed text starts with a Chinese character
    pattern = re.compile("[\u4e00-\u9fa5]")
    match = re.search(pattern, start)
    while match is None:
        start = int2word[np.random.random_integers(7, len(vocab) - 1)]
        match = re.search(pattern, start)
    print("Random start text: %s" % start)
    content = [start]

    # sampling=True means the batch size is 1 x 1
    model = WordRNN(len(vocab), lstm_size=lstm_size, sampling=True)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        # load the model parameters from the checkpoint
        saver.restore(sess, checkpoint)
        new_state = sess.run(model.initial_state)
        x = np.zeros((1, 1))
        w = word2int[start]
        # keep generating characters until the requested length is reached
        for i in range(length):
            x[0, 0] = w
            feed = {
                model.inputs: x,
                model.keep_prob: 1.,
                model.initial_state: new_state
            }
            preds, new_state = sess.run([model.prediction, model.final_state],
                                        feed_dict=feed)
            # idx = np.argmax(preds[0])
            w = pick_top_n(preds, len(vocab))
            content.append(int2word[w])
    return ''.join(content)
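# pick_top_n is referenced above but not defined in this snippet; a
# conventional implementation (an assumption, not the original code) keeps
# only the top_n most likely tokens and samples one of them:
def pick_top_n(preds, vocab_size, top_n=5):
    p = np.squeeze(preds)
    p[np.argsort(p)[:-top_n]] = 0  # zero out everything except the top_n probabilities
    p = p / np.sum(p)              # renormalise into a probability distribution
    return np.random.choice(vocab_size, 1, p=p)[0]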
def save(data):
    for item in data:
        if 'committee' not in item:
            continue
        d = None
        if 'date' in item:
            d = item['date']
        elif 'end' in item:
            d = item['end']
        elif 'time' in item:
            d = item['time']
        if not isinstance(d, str):
            d = str(d)
        id = item['committee'] + d + str(item['seq_no'])
        item['id'] = id
        process(item, id, db.comagenda, 'ep_comagendas',
                id + ' - ' + item['title'], onchanged=onchanged)
    return data
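# For example (illustrative values): an item with committee 'LIBE',
# date datetime(2019, 3, 7, 9, 0) and seq_no 4 gets
# item['id'] == 'LIBE2019-03-07 09:00:004'.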
def scrape(url, **kwargs):
    log(3, "scraping %s" % (url))
    root = getXML(url)
    if root is None:
        log(1, "could not get votes for %s" % url)
        return  # angrily o/
    log(3, "processing plenary votes xml from %s" % url)
    # root is:
    # PV.RollCallVoteResults EP.Number="PE 533.923" EP.Reference="P7_PV(2014)04-17" Sitting.Date="2014-04-17" Sitting.Identifier="1598443"
    votes = []
    for vote in root.xpath('//RollCallVote.Result'):
        # hrmpf, some EP vote timestamps come with a time component, some without
        time = vote.get('Date')
        if len(time.split()) == 2:
            ts = datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
        else:
            ts = datetime.strptime(time, "%Y-%m-%d")
        tmp = vote.get('Identifier')
        if tmp:
            voteid = int(tmp)
        else:
            tmp = vote.get('Number')
            if not tmp:
                log(1, "blimey, could not deduce an id for the vote in %s" % url)
                raise ValueError("no id for vote in %s" % url)
            voteid = "%s-%s" % (ts, tmp)
        title = vote.xpath("RollCallVote.Description.Text")
        if len(title) != 1:
            log(2, "holy ambiguity Batman! This vote doesn't have one title, but %d: %s %s" % (len(title), voteid, url))
            title = "!unknown!"
        else:
            title = junws(title[0])
        v = {u"ts": ts,
             u"url": url,
             u"voteid": voteid,
             u"title": title,
             'votes': {}}
        v.update(votemeta(v['title'], v['ts']))
        if 'epref' not in v:
            ref = vote.xpath("RollCallVote.Description.Text/a/text()")
            if ref:
                v['epref'] = unws(ref[0])
        for type, stype in [('Result.For', '+'),
                            ('Result.Against', '-'),
                            ('Result.Abstention', '0')]:
            type = vote.xpath(type)
            if not type:
                continue
            if len(type) > 1:
                log(2, "[pff] more than one %s entry in vote (id:%s) in %s" % (stype, v['voteid'], url))
            type = type[0]
            v['votes'][stype] = {'total': int(type.get('Number')),
                                 'groups': {}}
            for group in type.xpath('Result.PoliticalGroup.List'):
                g = str(group.get('Identifier'))
                if g not in v['votes'][stype]['groups']:
                    v['votes'][stype]['groups'][g] = []
                for tag in ['Member.Name', 'PoliticalGroup.Member.Name']:
                    for mep in group.xpath(tag):
                        m = {}
                        name = junws(mep)
                        mepid = mep.get("PersId")
                        if mepid:
                            mepid = int(mepid)
                        else:
                            mepid = db.getMep(name, v['ts'], abbr=g)
                        if mepid:
                            m['mepid'] = mepid
                            #if int(mep.get('MepId')) in ambiguous_meps:
                            #    oid = int(mep.get('MepId'))
                            #    ambiguous_meps.remove(oid)
                            #    log(2, 'found mepid for previously ambiguous obscure_id: "%s": %s' % (oid, mepid))
                        else:
                            mepid = lost_meps.get(mep.get('MepId'))
                            if mepid:
                                m['mepid'] = mepid
                            else:
                                m['name'] = name
                                # MepId is a confusing id that is not used anywhere else
                                m['obscure_id'] = int(mep.get('MepId'))
                        v['votes'][stype]['groups'][g].append(m)
        # save
        process(v, v['voteid'], db.vote, 'ep_votes', v['title'])
        votes.append(v)
    return votes
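# For reference, a single entry in the returned `votes` list has roughly this
# shape (field names taken from the code above, values purely illustrative):
# {
#     'ts': datetime(2014, 4, 17, 11, 47, 12),
#     'url': '<the RollCallVoteResults xml url>',
#     'voteid': 1598443,
#     'title': '...',
#     'epref': '...',  # only when a reference could be extracted
#     'votes': {
#         '+': {'total': 410, 'groups': {'PPE': [{'mepid': 1234}, ...]}},
#         '-': {'total': 120, 'groups': {...}},
#         '0': {'total': 35, 'groups': {...}},
#     },
# }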
def scrape(id, terms, mepname, **kwargs):
    activity_types = (
        ('plenary-speeches', 'CRE'),
        ('reports', "REPORT"),
        ('reports-shadow', "REPORT-SHADOW"),
        ('opinions', "COMPARL"),
        ('opinions-shadow', "COMPARL-SHADOW"),
        ('motions-instit', "MOTION"),
        ('oral-questions', "OQ"),
        # other activities
        ('written-explanations', 'WEXP'),
        ('major-interpellations', 'MINT'),
        ('written-questions', "WQ"),
        ('motions-indiv', "IMOTION"),
        ('written-declarations', "WDECL"),
    )
    activities = {}
    for type, TYPE in activity_types:
        for term in terms:
            page = 0
            cnt = 20
            url = "http://www.europarl.europa.eu/meps/en/%s/loadmore-activities/%s/%s/?page=%s&count=%s" % (
                id, type, term, page, cnt)
            try:
                root = fetch(url)
            except:
                log(1, "failed to fetch {}".format(url))
                raise ValueError
                #continue
            #print(url, file=sys.stderr)
            while len(root.xpath('//div[@class="erpl_document"]')) > 0:
                for node in root.xpath('//div[@class="erpl_document"]'):
                    if type == 'written-explanations':
                        item = {
                            'title': unws(''.join(node.xpath('./div/h3/span[@class="t-item"]//text()'))),
                            'date': datetime.strptime(node.xpath('./div[1]/div[1]/span[1]/text()')[0], u"%d-%m-%Y"),
                            'text': unws(''.join(node.xpath('./div[2]/div//text()')))}
                    elif type == 'written-declarations':
                        if len(node.xpath('./div[1]/div')) != 3:
                            log(2, "written decl item has not 3 divs but %d %s" % (len(node.xpath('./div[1]/div')), url))
                            continue
                        if len(node.xpath('./div[1]/div[1]/span')) != 3:
                            log(2, "written decl item has not 3 but %d spans in the 1st div at %s" % (len(node.xpath('./div[1]/div[1]/span')), url))
                            continue
                        item = {
                            'title': unws(''.join(node.xpath('./div/h3/span[@class="t-item"]//text()'))),
                            'date': datetime.strptime(node.xpath('./div[1]/div[1]/span[1]/text()')[0], u"%d-%m-%Y"),
                            'id': unws(''.join(node.xpath('./div[1]/div[1]/span[2]/text()')[0])),
                            'status': unws(''.join(node.xpath('./div[1]/div[1]/span[3]/text()')[0])),
                            'formats': [{'type': unws(fnode.xpath('./span/text()')[0]),
                                         'url': str(fnode.xpath('./@href')[0]),
                                         'size': unws(fnode.xpath('./span/span/text()')[0])}
                                        for fnode in node.xpath('./div[1]/div[2]/div[@class="d-inline"]/a')],
                            'authors': [{'name': name.strip(), "mepid": db.mepid_by_name(name.strip())}
                                        for name in node.xpath('./div[1]/div[3]/span/text()')],
                        }
                        for info in node.xpath('./div[2]/div'):
                            label = unws(''.join(info.xpath('./text()')))[:-2]
                            value = unws(''.join(info.xpath('./span/text()')))
                            if 'date' in label.lower():
                                value = datetime.strptime(value, u"%d-%m-%Y")
                            if label == 'Number of signatories':
                                number, date = value.split(' - ')
                                value = int(number)
                                item["No of sigs date"] = datetime.strptime(date, u"%d-%m-%Y")
                            item[label] = value
                    else:
                        #from lxml.etree import tostring
                        #print('\n'.join(tostring(e).decode() for e in node.xpath('./div/div[1]')))
                        # all other activities share the following scraper
                        ref = unws(''.join(node.xpath('./div[1]/div[1]/span[2]/text()')))
                        if ref.startswith('- '):
                            ref = ref[2:]
                        if ref.endswith(' -'):
                            ref = ref[:-2]
                        item = {
                            'date': datetime.strptime(node.xpath('./div[1]/div[1]/span[1]/text()')[0], u"%d-%m-%Y"),
                            'reference': ref,
                        }
                        if type not in ['written-questions', 'oral-questions']:
                            ref = unws(''.join(node.xpath('./div[1]/div[1]/span[3]/text()')))
                            if ref:
                                if not pere.match(ref):
                                    log(2, "pe does not have the expected format: '%s'" % ref)
                                else:
                                    item['pe'] = ref
                        # opinions don't have title urls... why would they?
                        refurl = node.xpath('./div[1]/h3/a/@href')
                        if refurl:
                            item['url'] = str(refurl[0])
                        item['title'] = unws(''.join(node.xpath('./div/h3//span[@class="t-item"]//text()')))
                        abbr = node.xpath('./div[1]/div[1]/span/span[contains(concat(" ",normalize-space(@class)," ")," erpl_badge-committee ")]/text()')
                        if len(abbr):
                            item['committee'] = [a for a in [unws(c) for c in abbr] if a]
                        formats = []
                        for fnode in node.xpath('./div[1]/div[2]/div[@class="d-inline"]/a'):
                            elem = {'type': unws(fnode.xpath('./span/text()')[0]),
                                    'url': str(fnode.xpath('./@href')[0])}
                            tmp = fnode.xpath('./span/span/text()')
                            if len(tmp) > 0:
                                elem['size'] = unws(tmp[0])
                            formats.append(elem)
                        if formats:
                            item['formats'] = formats
                        authors = [{'name': name.strip(), "mepid": db.mepid_by_name(name.strip())}
                                   for name in node.xpath('./div[1]/div[3]/span/text()')]
                        if authors:
                            item['authors'] = authors
                        if type in ['opinions-shadow', 'opinions']:
                            for f in item['formats']:
                                if f['type'] == 'PDF':
                                    ref = pdf2ref(f['url'])
                                    if ref is not None:
                                        item['dossiers'] = [ref]
                                    break
                        else:
                            # try to deduce the dossier from the document reference
                            dossiers = db.get('dossiers_by_doc', item['reference']) or []
                            if len(dossiers) > 0:
                                item['dossiers'] = [d['procedure']['reference'] for d in dossiers]
                            elif '+DOC+PDF+' not in item['url']:
                                # try to figure out the associated dossier by making an (expensive) http request to the ep
                                log(4, "fetching primary activity page %s" % item['url'])
                                try:
                                    refroot = fetch(item['url'])
                                except:
                                    refroot = None
                                if refroot is not None:
                                    if '/doceo/' in item['url']:
                                        # the new EP site removed the span with the procedure
                                        fulla = refroot.xpath('//table[@class="buttondocwin"]//a/img[@src="/doceo/data/img/navi_moredetails.gif"]/..')
                                        if fulla:
                                            fullurl = fulla[0].get('href')
                                            if fullurl.endswith('.html'):
                                                if fullurl[-7:-5] != 'EN':
                                                    fullurl = fullurl[:-7] + 'EN.html'
                                            log(4, 'loading activity full text page %s' % fullurl)
                                            if fullurl.startswith('/doceo'):
                                                fullurl = 'https://www.europarl.europa.eu' + fullurl
                                            if fullurl != item['url']:
                                                refroot = fetch(fullurl)
                                        else:
                                            log(4, 'no fulla for %s' % item['url'])
                                    anchor = refroot.xpath('//span[@class="contents" and text()="Procedure : " and not(ancestor::div[@style="display:none"])]')
                                    if len(anchor) == 1:
                                        dossier = anchor[0].xpath("./following-sibling::a/text()")
                                        if len(dossier) == 1:
                                            item['dossiers'] = [unws(dossier[0])]
                                        elif len(dossier) > 1:
                                            log(2, "more than one dossier in ep info page: %d %s" % (len(dossier), item['url']))
                                    elif len(anchor) > 1:
                                        log(2, "more than one anchor in ep info page: %d %s" % (len(anchor), item['url']))
                    item['term'] = term
                    if TYPE not in activities:
                        activities[TYPE] = []
                    activities[TYPE].append(item)
                if len(root.xpath('//div[@class="erpl_document"]')) < cnt:
                    break
                page += 1
                url = "http://www.europarl.europa.eu/meps/en/%s/loadmore-activities/%s/%s/?page=%s&count=%s" % (
                    id, type, term, page, cnt)
                try:
                    root = fetch(url)
                except:
                    log(1, "failed to fetch {}".format(url))
                    #raise ValueError
                    break
                #print(url, file=sys.stderr)
        if TYPE in activities:
            activities[TYPE] = sorted(activities[TYPE], key=lambda x: x['date'])
    activities['mep_id'] = id
    if len(activities.keys()) > 1:
        process(activities, id, db.activities, 'ep_mep_activities', mepname, nodiff=True)
        return activities
    return {}
def train(batch_size=10, seq_len=150, epochs=200):
    if not os.path.exists(os.path.dirname(FLAGS.checkpoints_dir)):
        os.mkdir(os.path.dirname(FLAGS.checkpoints_dir))
    if not os.path.exists(FLAGS.checkpoints_dir):
        os.mkdir(FLAGS.checkpoints_dir)
    data, word2int, int2word, vocab = process(FLAGS.file_path)
    with open("./output/vocabularies.txt", "w+") as f:
        f.write(str(vocab))
    model = WordRNN(len(vocab),
                    batch_size=batch_size,
                    seq_len=seq_len,
                    lstm_size=lstm_size,
                    layer_count=layer_count,
                    learning_rate=learning_rate)
    saver = tf.train.Saver(max_to_keep=100)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        start_epoch = 0
        checkpoint = tf.train.latest_checkpoint('./checkpoints')
        if checkpoint:
            saver.restore(sess, checkpoint)
            print(checkpoint)
            print("[%s] resuming training from checkpoint %s" % (strdatetime(), checkpoint))
            pattern = re.compile(r"\./checkpoints/(\d+).*")
            start_epoch += int(re.match(pattern, checkpoint).group(1))
        print('[%s] training started...' % strdatetime())
        for e in range(start_epoch, epochs):
            print("[%s]--------- epoch %d of %d --------" % (strdatetime(), e + 1, epochs))
            # Train network
            new_state = sess.run(model.initial_state)
            batch = 0
            batch_count = int(len(data) / (batch_size * seq_len))
            print("%d word units in total, %d batches" % (len(data), batch_count))
            for x, y in generate_batch(data, batch_size, seq_len):
                batch += 1
                start = time.time()
                feed = {
                    model.inputs: x,
                    model.targets: y,
                    model.keep_prob: keep_prob,
                    model.initial_state: new_state
                }
                loss, new_state, _ = sess.run(
                    [model.loss, model.final_state, model.optimizer],
                    feed_dict=feed)
                end = time.time()
                # control the print lines
                # if counter % 100 == 0:
                print('[%s] batch: %d, time: %.6fs, loss: %.6f' % (strdatetime(), batch, end - start, loss))
                if ((e + 1) % save_freq == 0
                        and (batch == batch_count or batch == 1
                             or batch == int(batch_count / 2))):
                    saver.save(sess,
                               "checkpoints/{}-{}-{}.ckpt".format(e + 1, batch, lstm_size))
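# generate_batch is used above but not shown; a typical implementation (an
# assumption about the original) slices the encoded corpus into
# batch_size x seq_len input windows, with targets shifted by one step:
def generate_batch(data, batch_size, seq_len):
    arr = np.array(data)
    n_batches = len(arr) // (batch_size * seq_len)
    arr = arr[:n_batches * batch_size * seq_len].reshape((batch_size, -1))
    for i in range(0, arr.shape[1], seq_len):
        x = arr[:, i:i + seq_len]
        y = np.zeros_like(x)
        y[:, :-1], y[:, -1] = x[:, 1:], x[:, 0]  # next-word targets, wrapped at the end
        yield x, y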
def scrape(url, meps=None, **kwargs):
    prolog = True
    res = []
    block = None
    reference = None
    date = None
    committee = []
    text, PE = getraw(url)
    motion = False
    for line in text:
        #log(4, 'line is: "%s"' % line)
        if prolog:
            line = unws(line)
            if not line:
                continue
            if amstart.match(line):
                if PE is None:
                    log(1, "document has no PE id: %s" % url)
                if reference is None:
                    log(1, "[!] couldn't find ref: %s" % (unws([x for x in text[:20] if unws(x)][2])))
                    # marking as scraped though
                    if not motion:
                        log(1, "couldn't find dossier reference in source pdf: %s" % url)
                        #raise ValueError("No dossier reference in amendment: %s" % url)
                        return
                    log(3, "couldn't find dossier reference in source pdf, but was marked as motion: %s" % url)
                    return
                if date is None or committee == []:
                    log(1, "[!] couldn't find date or committee: %s" % url)
                    raise ValueError("No date or committee in amendment")
                block = [line]
                prolog = False
                continue
            if line == 'Draft motion for a resolution':
                log(4, "document is a draft motion for resolution")
                motion = True
            m = re.search(pere, line)
            if m:
                if PE is None:
                    PE = m.group(0)
                log(4, "found PE reference: %s" % PE)
                line = unws(line.replace(PE, ''))
                log(4, 'updated line is: "%s"' % line)
            if line in COMMITTEE_MAP:
                log(4, 'found committee: "%s"' % line)
                committee.append(COMMITTEE_MAP[line])
                continue
            m = re.search(refre, line)
            if committee and not reference and m:
                reference = m.group(1)
                log(4, 'found reference: "%s"' % reference)
                if url == 'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-506.166%2b03%2bDOC%2bPDF%2bV0%2f%2fEN':
                    log(3, "adjusting reference to eudatap")
                    reference = "2012/0011(COD)"
                continue
            if not date:
                try:
                    date = parse(unws(line), dayfirst=True)
                    log(4, 'found date: "%s"' % line)
                except ValueError:
                    pass
                except TypeError:
                    pass
            continue
        if amstart.match(line):
            # parse the block collected so far
            am = parse_block(block, url, reference, date, committee, meps, PE)
            if am is not None:
                process(am, am['id'], db.amendment, 'ep_amendments',
                        am['reference'] + ' ' + am['id'], nodiff=True)
                res.append(am)
            block = [line]
            continue
        block.append(line)
    # flush the last block (any(filter(...)) because filter objects are always truthy in py3)
    if block and any(filter(None, block)):
        am = parse_block(block, url, reference, date, committee, meps, PE)
        if am is not None:
            process(am, am['id'], db.amendment, 'ep_amendments',
                    am['reference'] + ' ' + am['id'], nodiff=True)
            res.append(am)
    log(3, "total amendments %d in %s" % (len(res), url))
    return res
def process_feature(data, transform=lambda x: x, policy=lambda x: True):
    return process(data, users['msno'], indices, transform, policy)
def process_feature(data, transform=lambda x: x, policy=lambda x: True):
    return process(data, songs['song_id'], indices, transform, policy)
def scrape(id, **kwargs):
    # we ignore the /meps/en/<id>/<name>/home path, since we can get all info also from other pages
    url = "http://www.europarl.europa.eu/meps/en/%s/name/cv" % id
    xml = fetch_raw(url)  # we have to patch up the returned html...
    xml = xml.replace("</br>", "<br/>")  # ...it contains some bad tags...
    root = fromstring(xml)  # ...which make the lxml soup parser drop some branches in the DOM
    sidebar_check(root, url)
    mep = {
        'UserID': id,
        'Name': mangleName(unws(' '.join(root.xpath('//span[@class="sln-member-name"]/text()'))), id),
        'Photo': "https://www.europarl.europa.eu/mepphoto/%s.jpg" % id,
        'meta': {'url': url},
        'Twitter': [unws(x.replace("http:// ", "")) for x in root.xpath('//section[@id="presentationmep"]//a[@data-original-title="Twitter"]/@href')],
        'Homepage': [unws(x.replace("http:// ", "")) for x in root.xpath('//section[@id="presentationmep"]//a[@data-original-title="Website"]/@href')],
        'Facebook': [unws(x.replace("http:// ", "")) for x in root.xpath('//section[@id="presentationmep"]//a[@data-original-title="Facebook"]/@href')],
        'Instagram': [unws(x.replace("http:// ", "")) for x in root.xpath('//section[@id="presentationmep"]//a[@data-original-title="Instagram"]/@href')],
        'Mail': [deobfus_mail(x) for x in root.xpath('//section[@id="presentationmep"]//a[@data-original-title="E-mail"]/@href')],
        'Addresses': parse_addr(root),
        'active': False,
    }
    mep = addchangednames(mep)

    birthdate = root.xpath('//time[@id="birthDate"]/text()')
    if len(birthdate) > 0:
        mep['Birth'] = {'date': datetime.strptime(unws(birthdate[0]), u"%d-%m-%Y")}
        place = root.xpath('//time[@id="birthDate"]/following-sibling::text()')
        if len(place) > 0:
            tmp = unws(' '.join(place))
            if tmp.startswith(", "):
                tmp = tmp[2:]
            mep['Birth']['place'] = tmp

    death = root.xpath('//time[@id="deathDate"]/text()')
    if death:
        mep['Death'] = datetime.strptime(unws(death[0]), u"%d-%m-%Y")

    body = root.xpath('//span[@id="detailedcardmep"]/following-sibling::section')[0]
    if body.xpath('.//h1[text()="Curriculum vitae "]'):
        if not body.xpath('.//h3[@id="no_cv_available"]'):
            mep['CV'] = {
                'updated': datetime.strptime(
                    unws(body.xpath('.//p[@class="small"]/strong[contains(text(),"Updated: ")]/text()')[0]),
                    u"Updated: %d/%m/%Y")}
            mep['CV'].update({
                unws(''.join(title.xpath(".//text()"))): [
                    unws(''.join(item.xpath(".//text()"))).replace("-...", "- ...")
                    for item in title.xpath("following-sibling::ul/li")]
                for title in body.xpath('.//h4')
                #if not unws(''.join(title.xpath(".//text()"))).startswith("Original version : ")
            })

    # assistants
    url = "http://www.europarl.europa.eu/meps/en/%s/name/assistants" % id
    root = fetch(url)
    body = root.xpath('//span[@id="detailedcardmep"]/following-sibling::section')[0]
    if unws(' '.join(body.xpath(".//h1/text()"))) == "Assistants":
        for h4 in body.xpath('.//h4'):
            title = unws(''.join(h4.xpath(".//text()")))
            assistants = [unws(''.join(item.xpath(".//text()"))) for item in h4.xpath("../div//span")]
            if title in ['Accredited assistants', 'Local assistants']:
                if 'assistants' not in mep:
                    mep['assistants'] = {}
                title = title.lower().split()[0]
                if assistants:
                    mep['assistants'][title] = assistants
            elif title in ['Accredited assistants (grouping)', 'Local assistants (grouping)',
                           'Service providers', 'Trainees', 'Paying agents (grouping)',
                           'Paying agents',
                           'Assistants to the Vice-Presidency/to the Quaestorate']:
                if 'assistants' not in mep:
                    mep['assistants'] = {}
                title = title.lower()
                if assistants:
                    mep['assistants'][title] = assistants
            else:
                log(2, 'unknown title for assistants "{}" {}'.format(title, url))
                raise ValueError

    # declarations
    root = fetch("http://www.europarl.europa.eu/meps/en/%s/name/declarations" % id)
    body = root.xpath('//span[@id="detailedcardmep"]/following-sibling::section')[0]
    if unws(' '.join(body.xpath(".//h1/text()"))) == "Declarations":
        for title in body.xpath('.//h4'):
            key = unws(''.join(title.xpath('.//text()')))
            if key == 'Declaration of financial interests':
                key = 'Financial Declarations'
                mep[key] = []
                for pdf in title.xpath('./following-sibling::ul/li/a'):
                    url = pdf.xpath('./@href')[0]
                    try:
                        mep[key].append(findecl.scrape(url))
                    except:
                        log(1, "failed to extract findecl from %s" % url)
            elif key == 'Declarations of participation by Members in events organised by third parties':
                key = 'Declarations of Participation'
                mep[key] = []
                # reversed order, otherwise newer ones get prepended and mess up the diff
                for pdf in title.xpath('./following-sibling::ul/li/a')[::-1]:
                    url = pdf.xpath('./@href')[0]
                    name = unws(''.join(pdf.xpath('.//text()')))
                    mep[key].append({'title': name, 'url': url})
            elif key in ['Declaration of good conduct',
                         'Voluntary confirmation on the use of the General Expenditure Allowance']:
                mep[key] = []
                # reversed order, otherwise newer ones get prepended and mess up the diff
                for pdf in title.xpath('./following-sibling::ul/li/a')[::-1]:
                    url = pdf.xpath('./@href')[0]
                    name = unws(''.join(pdf.xpath('.//text()')))
                    mep[key].append({'title': name, 'url': url})
            else:
                log(2, 'unknown type of declaration: "%s" http://www.europarl.europa.eu/meps/en/%s/name/declarations' % (key, id))
                key = None
                raise ValueError

    # history
    parse_history(id, root, mep)
    process(mep, id, db.mep, 'ep_meps', mep['Name']['full'],
            nopreserve=(['Addresses'], ['assistants']), onchanged=onchanged)

    if __name__ == '__main__':
        return mep
    del mep
# You should have received a copy of the GNU Affero General Public License
# along with parltrack. If not, see <http://www.gnu.org/licenses/>.
# (C) 2019 by Stefan Marsiske, <*****@*****.**>, Asciimoo

from db import db
from utils.process import process
import requests

if __name__ == "__main__":
    csv = requests.get(
        'https://github.com/TechToThePeople/mep/raw/production/data/meps.nogender.csv'
    ).text
    genders = [l.split(',')[:2] for l in csv.split('\n')][1:-1]
    try:
        for mepid, gender in genders:
            mep = db.mep(int(mepid))
            if not mep:
                print("no mep found for id", mepid)
                continue
            mep['Gender'] = gender
            process(mep, int(mepid), db.mep, 'ep_meps', mep['Name']['full'],
                    nopreserve=(['Addresses'], ['assistants']))
    finally:
        db.commit('ep_meps')