def genVocab(vocabfile):
    """Build a word-frequency vocabulary from the `news` and `crawldata`
    tables and write it to *vocabfile* as "word count" lines.

    Side effects:
      - writes ./data/len.csv with per-article brief/content lengths
      - words occurring fewer than 5 times are folded into the UNKNOWN token
    """
    mysql = MySQL()
    mysql.login()
    cursor = mysql.get_cursor()
    vocab = defaultdict(int)

    def imdict(ab):
        # Count whitespace-separated tokens, skipping empty tokens,
        # all-lowercase English words (rec) and tokens matched by rec0.
        for a in ab.split(" "):
            a = a.strip()
            if len(a) == 0 or (rec.match(a) and a.islower()) or rec0.match(a):
                continue
            vocab[a] += 1

    urlset = set()
    dalist = []
    for table in ("news", "crawldata"):
        # Table names come from a fixed tuple, so this interpolation is safe;
        # never build SQL like this from user input.
        cursor.execute("select title,brief,content,url from %s where 1" % table)
        for title, brief, content, url in cursor.fetchall():
            if url in urlset:  # de-duplicate articles across tables by URL
                continue
            urlset.add(url)
            title = Data.extract_html(title, False)
            imdict(title)
            if table == "news" and brief is not None:
                brief = re.sub("摘要:", "", brief)  # strip the "abstract:" prefix
                brief = Data.extract_html(brief, False)
                imdict(brief)
                brieflen = len(brief)
            else:
                brieflen = 0
            # Drop photo-credit boilerplate ("stock photo (source: ...)").
            content = re.sub("资料图(图源:.*?)", "", content)
            try:
                content = Data.extract_html(content)
            except Exception:
                # BUG FIX: was a bare `except:` which also swallowed
                # KeyboardInterrupt/SystemExit; narrowed to Exception.
                continue
            time.sleep(0.1)
            imdict(content)
            contentlen = len(content)
            dalist.append([brieflen, contentlen])

    data = pd.DataFrame(columns=["brief", "content"], data=dalist)
    data = data[data["brief"] > 0]
    data.to_csv("./data/len.csv", index=False)
    mysql.close()

    # Seed with the special tokens; counts < 5 accumulate into UNKNOWN.
    newvocab = {
        Data.UNKNOWN_TOKEN: 0,
        Data.PAD_TOKEN: -1,
        Data.SENTENCE_START: -1,
        Data.SENTENCE_END: -1,
    }
    for key, value in vocab.items():
        if value >= 5:
            newvocab[key] = value
        else:
            newvocab[Data.UNKNOWN_TOKEN] += value

    # BUG FIX: explicit UTF-8 encoding — the vocabulary contains Chinese
    # words, so relying on the platform default encoding can crash on write.
    with open(vocabfile, "w", encoding="utf-8") as f:
        for word, num in newvocab.items():
            f.write(word + " " + str(num) + "\n")
class Spider(object):
    """Crawl arXiv abstract pages and persist them into MySQL."""

    def __init__(self):
        self._headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
            'Sec-Fetch-Mode': 'no-cors',
            'Host': 'arxiv.org'
        }
        self._sess = requests.Session()
        self._sleep_time = 5  # politeness delay (seconds) between requests
        self._mysql = MySQL()

    def _get_detail(self, url):
        """Fetch one abstract page; return (arxiv_id, title, abstract, subjects).

        Retries forever on network errors, recreating the session each time.
        Raises IndexError when the page has no title element — the caller
        uses this to detect non-existent paper numbers.
        """
        while 1:
            try:
                content = self._sess.get(url, headers=self._headers).content
            except Exception as e:
                print(e)
                # Network hiccup: rebuild the session and retry after a pause.
                self._sess.close()
                self._sess = requests.Session()
                time.sleep(self._sleep_time)
                continue
            html = etree.HTML(content)
            title = html.xpath(
                '//h1[@class="title mathjax"]/text()')[0].strip()
            abstract = html.xpath(
                '//blockquote[@class="abstract mathjax"]/text()')[0].strip()
            subjects = html.xpath(
                'string(//td[@class="tablecell subjects"])').strip()
            arxiv = url.split('/')[-1]
            print(arxiv)
            return (arxiv, title, abstract, subjects)

    def crawl_arxiv_n(self, begin, stop):
        """Crawl arXiv papers 1709.{begin:05d} .. 1709.{stop:05d} into the
        `rec_arxiv_paper` table, stopping early after more than 5 consecutive
        missing paper numbers.
        """
        self._mysql.connect()
        index_error_count = 0
        for i in range(begin, stop + 1):
            try:
                result = self._get_detail(
                    'https://arxiv.org/abs/1709.{:05d}'.format(i))
                index_error_count = 0
                self._mysql.execute(
                    'INSERT IGNORE INTO `rec_arxiv_paper` \
                    (`arxiv`, `title`, `abstract`, `subjects`) VALUES \
                    (%s, %s, %s, %s)', result)
                time.sleep(self._sleep_time // 5)
                # BUG FIX: was `if i % 150:` — that recreated the session on
                # every iteration NOT divisible by 150 (the opposite of the
                # intent). Refresh the session once every 150 requests.
                if i % 150 == 0:
                    self._sess.close()
                    self._sess = requests.Session()
            except IndexError:
                # Missing paper number; give up after >5 consecutive misses.
                index_error_count += 1
                if index_error_count > 5:
                    break
        self._mysql.close()
def main():
    """Entry point: crawl ladders for every region, record the JSL players
    found there, and write the valid-player report to disk.

    With one CLI argument, battletags are read from that file instead of the
    per-region default files.
    """
    battletag_from_cli = []
    if len(sys.argv) == 1:
        # No file given: verify the default per-region files exist.
        try:
            verify_files_exists(REGION_CODES)
        except FileNotFoundError:
            exit(1)
    elif len(sys.argv) == 2:
        if not os.path.exists(sys.argv[1]):
            Log.write_log_message("Specified file does not exist, exiting...",
                                  True)
            # BUG FIX: the original logged "exiting..." but fell through to
            # open() anyway, which would raise IOError; actually exit here.
            exit(1)
        # BUG FIX: the file handle was never closed; use a context manager.
        with open(sys.argv[1], "r") as btags:
            for btag in btags:
                battletag_from_cli.append(btag.strip())

    # get the API request parameters
    request_parameters = get_request_parameters()

    # get the current season ID (required for ladder queries)
    season_id = -1
    try:
        season_id = API.get_current_season_id(request_parameters)
    except RequestError as e:
        print(e)
        exit(1)
    Log.write_log_message("Current Season ID: {}".format(season_id))

    db_handle = MySQL()
    for region in REGION_CODES:
        Log.write_log_message("Starting {} Region".format(region.upper()))

        # get ladders
        ladders = API.get_all_ladders(region, MAX_LEAGUE_ID, season_id,
                                      request_parameters)
        Log.write_log_message("Total Ladders Found: {}".format(len(ladders)))

        # add all of the ladders to the database
        try:
            add_ladders_to_database(db_handle, ladders)
        except MySQLdb.IntegrityError:
            Log.write_log_message(
                "Ladders are already in database for {}".format(
                    region.upper()))

        # read in btags to a list (CLI file overrides per-region files)
        if len(battletag_from_cli) == 0:
            battletags = get_battletags(region)
        else:
            battletags = battletag_from_cli
        num_battletags = len(battletags)
        Log.write_log_message("Battletags Read In: {}".format(num_battletags))

        # PERF FIX: build the lowercase lookup set ONCE per region instead of
        # rebuilding a lowercase list for every player in every ladder
        # (was O(num_battletags) work per player via list.__contains__).
        battletags_lower = {battletag.lower() for battletag in battletags}

        # go through every ladder looking for one of our players
        for ladder in ladders:
            players = API.get_players_in_ladder(region, ladder,
                                                request_parameters)
            for player in players:
                if player.battletag.lower() in battletags_lower:
                    # a JSL contestant was found
                    db_handle.add_player(player)
                    for team in player.ladders:
                        db_handle.add_race(player, team)
                    for team in player.ladders:
                        # NOTE(review): "divison" matches the project's
                        # attribute spelling — do not "fix" without renaming
                        # the attribute at its definition.
                        Log.write_log_message(
                            "Found player: {} [{} {} {}]".format(
                                player.battletag, team.league, team.divison,
                                team.race))

    # get all players in database and write the report
    Log.write_log_message("Writing valid player data to disk")
    valid_players = db_handle.get_all_valid_players()
    write_valid_players(valid_players)

    # close database
    db_handle.close()