Esempio n. 1
0
def genVocab(vocabfile):
    """Build a word-frequency vocabulary from articles stored in MySQL.

    Scans the `news` and `crawldata` tables, strips HTML from the
    title / brief / content fields, counts token frequencies, writes
    per-article length statistics to ./data/len.csv, and finally dumps
    the vocabulary (tokens occurring >= 5 times, plus special tokens)
    to *vocabfile* as one "word count" pair per line.

    Args:
        vocabfile: path of the output vocabulary file.
    """
    mysql = MySQL()
    mysql.login()
    cursor = mysql.get_cursor()

    vocab = defaultdict(int)

    def imdict(ab):
        # Tokenize on single spaces and count tokens, skipping empties,
        # all-lowercase English words (rec) and tokens matched by rec0.
        for a in ab.split(" "):
            a = a.strip()
            if len(a) == 0 or (rec.match(a) and a.islower()) or rec0.match(a):
                continue
            vocab[a] += 1

    urlset = set()   # de-duplicate articles across both tables by URL
    dalist = []      # [brief length, content length] per kept article
    tables = ["news", "crawldata"]
    for table in tables:
        # Table names come from the fixed list above, so this string
        # interpolation is not an injection risk.
        sent = "select title,brief,content,url from %s where 1" % table
        cursor.execute(sent)

        for title, brief, content, url in cursor.fetchall():
            if url in urlset:
                continue
            urlset.add(url)
            title = Data.extract_html(title, False)
            imdict(title)

            if table == "news" and brief is not None:
                # Drop the leading "摘要:" ("abstract:") marker.
                brief = re.sub("摘要:", "", brief)
                brief = Data.extract_html(brief, False)
                imdict(brief)
                brieflen = len(brief)
            else:
                brieflen = 0

            # Strip photo-credit boilerplate from the body text.
            content = re.sub("资料图(图源:.*?)", "", content)
            try:
                content = Data.extract_html(content)
            except Exception:
                # Was a bare `except:`; narrowed so KeyboardInterrupt /
                # SystemExit are no longer swallowed. Articles that fail
                # to parse are simply skipped, as before.
                continue
            time.sleep(0.1)
            imdict(content)
            contentlen = len(content)
            dalist.append([brieflen, contentlen])

    data = pd.DataFrame(columns=["brief", "content"], data=dalist)
    data = data[data['brief'] > 0]
    data.to_csv("./data/len.csv", index=False)
    mysql.close()

    # Special tokens: UNKNOWN accumulates the counts of rare (<5) words;
    # the other sentinels carry a placeholder count of -1.
    newvocab = {Data.UNKNOWN_TOKEN: 0, Data.PAD_TOKEN: -1,
                Data.SENTENCE_START: -1, Data.SENTENCE_END: -1}
    for key, value in vocab.items():
        if value >= 5:
            newvocab[key] = value
        else:
            newvocab[Data.UNKNOWN_TOKEN] += value
    with open(vocabfile, 'w') as f:
        for word, num in newvocab.items():
            f.write(word + " " + str(num) + "\n")
Esempio n. 2
0
class Spider(object):
    """Crawler that scrapes arXiv abstract pages into a MySQL table."""

    def __init__(self):
        self._headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
            'Sec-Fetch-Mode': 'no-cors',
            'Host': 'arxiv.org'
        }
        self._sess = requests.Session()
        self._sleep_time = 5  # seconds to back off after a network error
        self._mysql = MySQL()

    def _get_detail(self, url):
        """Fetch *url* and return (arxiv_id, title, abstract, subjects).

        Retries forever on network errors, recreating the session each
        time. May raise IndexError when the page lacks a title/abstract
        (e.g. a non-existent ID); the caller relies on that to detect
        the end of the ID range.
        """
        while 1:
            try:
                content = self._sess.get(url, headers=self._headers).content
            except Exception as e:
                print(e)
                # The connection pool may be in a bad state; start fresh.
                self._sess.close()
                self._sess = requests.Session()
                time.sleep(self._sleep_time)
                continue
            html = etree.HTML(content)
            title = html.xpath(
                '//h1[@class="title mathjax"]/text()')[0].strip()
            abstract = html.xpath(
                '//blockquote[@class="abstract mathjax"]/text()')[0].strip()
            subjects = html.xpath(
                'string(//td[@class="tablecell subjects"])').strip()
            arxiv = url.split('/')[-1]
            print(arxiv)
            return (arxiv, title, abstract, subjects)

    def crawl_arxiv_n(self, begin, stop):
        """Crawl arXiv IDs 1709.{begin:05d} .. 1709.{stop:05d} inclusive.

        Inserts each paper into `rec_arxiv_paper` (duplicates ignored).
        Stops early after more than 5 consecutive missing pages, which
        indicates the end of the month's submissions.
        """
        self._mysql.connect()
        index_error_count = 0
        for i in range(begin, stop + 1):
            try:
                result = self._get_detail(
                    'https://arxiv.org/abs/1709.{:05d}'.format(i))
                index_error_count = 0
                self._mysql.execute(
                    'INSERT IGNORE INTO `rec_arxiv_paper` \
                    (`arxiv`, `title`, `abstract`, `subjects`) VALUES \
                    (%s, %s, %s, %s)', result)
                time.sleep(self._sleep_time // 5)
                # BUG FIX: the original tested `if i % 150:`, which is
                # true for every i NOT divisible by 150, so the session
                # was recreated on almost every request. Recycle the
                # session once every 150 requests instead.
                if i % 150 == 0:
                    self._sess.close()
                    self._sess = requests.Session()
            except IndexError:
                # Page had no title/abstract: ID probably doesn't exist.
                index_error_count += 1
                if index_error_count > 5:
                    break
        self._mysql.close()
Esempio n. 3
0
def main():
    """Entry point: crawl SC2 ladders per region and record JSL players.

    With no CLI argument, battletags are read per region via
    get_battletags(); with one argument, it names a file of battletags
    (one per line) that is used for every region. Results are written
    to the database and a valid-player summary to disk.
    """
    # verify that the necessary files exist
    battletag_from_cli = []
    if len(sys.argv) == 1:
        try:
            verify_files_exists(REGION_CODES)
        except FileNotFoundError:
            exit(1)
    elif len(sys.argv) == 2:
        if not os.path.exists(sys.argv[1]):
            Log.write_log_message("Specified file does not exist, exiting...",
                                  True)
            # BUG FIX: the original only logged and fell through, then
            # crashed in open(); exit as the message promises.
            exit(1)
        # Close the file when done (the original leaked the handle).
        with open(sys.argv[1], "r") as btags:
            for btag in btags:
                battletag_from_cli.append(btag.strip())

    # get the API request parameters
    request_parameters = get_request_parameters()

    # get the current season ID
    season_id = -1
    try:
        season_id = API.get_current_season_id(request_parameters)
    except RequestError as e:
        print(e)
        exit(1)
    Log.write_log_message("Current Season ID: {}".format(season_id))

    db_handle = MySQL()

    for region in REGION_CODES:
        Log.write_log_message("Starting {} Region".format(region.upper()))

        # get ladders
        ladders = API.get_all_ladders(region, MAX_LEAGUE_ID, season_id,
                                      request_parameters)
        Log.write_log_message("Total Ladders Found: {}".format(len(ladders)))

        # add all of the ladders to the database
        try:
            add_ladders_to_database(db_handle, ladders)
        except MySQLdb.IntegrityError:
            Log.write_log_message(
                "Ladders are already in database for {}".format(
                    region.upper()))

        # read in btags to a list
        if len(battletag_from_cli) == 0:
            battletags = get_battletags(region)
        else:
            battletags = battletag_from_cli
        num_battletags = len(battletags)
        Log.write_log_message("Battletags Read In: {}".format(num_battletags))

        # Precompute the lowercase lookup set once: the original rebuilt
        # a lowercase list (and called .__contains__ on it) for every
        # player in every ladder.
        battletags_lower = {battletag.lower() for battletag in battletags}

        # go through every ladder looking for one of our players
        for ladder in ladders:
            # get all of the players in the ladder
            players = API.get_players_in_ladder(region, ladder,
                                                request_parameters)

            for player in players:
                if player.battletag.lower() in battletags_lower:
                    # a JSL contestant was found
                    db_handle.add_player(player)

                    # Single pass over the player's teams (the original
                    # iterated player.ladders twice for no benefit).
                    for team in player.ladders:
                        db_handle.add_race(player, team)
                        Log.write_log_message(
                            "Found player: {} [{} {} {}]".format(
                                player.battletag, team.league, team.divison,
                                team.race))

    # get all players in database
    Log.write_log_message("Writing valid player data to disk")
    valid_players = db_handle.get_all_valid_players()
    write_valid_players(valid_players)

    # close database
    db_handle.close()