def extract(self):
        """Extract job postings from the collected page and persist each one.

        Assigns a ``MySQL('job')`` handle to ``self.db``, then walks every
        ``div.job-list>ul>li`` element, pulls the job, company and publisher
        fields out of it and hands the resulting dict to ``self.write``.
        The publication date is parsed as ``%m月%d日`` with the year forced
        to 2019; text that does not parse falls back to 18 November 2019.
        """
        self.db = MySQL('job')
        date_format = '%m月%d日'
        for item in self.findall('div.job-list>ul>li'):
            title = self.find('.job-title', item).text
            salary = self.find('.red', item).text

            # Job identity: the id is the path segment of the detail link.
            job_link = self.find('.info-primary>h3.name>a', item).get_attribute('href')
            job_id = re.search(r'/job_detail/(.*).html', job_link).group(1)
            job_info = self.find('.info-primary>p', item).get_attribute('innerHTML')

            # Company identity, taken from the company anchor.
            company_name = self.find(".info-company>div>h3>a", item).text
            company_link = self.find(".info-company>div>h3>a", item).get_attribute('href')
            company_id = re.search(r'/gongsi/(.*).html', company_link).group(1)

            # The company paragraph is a separator-delimited list: the first
            # entry is used as the industry and the last one as the size.
            company_html = self.find('.company-text>p', item).get_attribute('innerHTML')
            company_parts = company_html.split('''<em class="vline"></em>''')
            company_industry = company_parts[0]
            company_size = company_parts[-1]

            publisher_html = self.find('.info-publis>h3', item).get_attribute('innerHTML')
            publisher = publisher_html.split('<em class="vline"></em>')[-1]

            date_text = self.find('.info-publis p', item).text.split('发布于')[1]
            try:
                parsed = datetime.datetime.strptime(date_text, date_format)
            except ValueError:
                # Unparseable date text: use the fixed fallback day.
                parsed = datetime.datetime.strptime('11月18日', date_format)
            pub_date = parsed.replace(year=2019)

            self.write({
                'title': title,
                'salary': salary,
                'job_info': job_info.replace('\"', '\''),
                'job_id': job_id,
                'company_name': company_name,
                'company_id': company_id,
                'company_industry': company_industry,
                'company_size': company_size,
                'publisher': publisher,
                'pub_date': str(pub_date),
            })
Beispiel #2
0
def genVocab(vocabfile):
    """Build a word-frequency vocabulary from the database and write it out.

    Reads ``title, brief, content, url`` from the ``news`` and ``crawldata``
    tables (skipping rows whose url was already seen), counts the
    space-separated tokens of the cleaned texts, and writes one
    ``"<word> <count>"`` line per kept word to *vocabfile*.  Words seen
    fewer than 5 times are folded into ``Data.UNKNOWN_TOKEN``'s count.

    As a side effect the brief/content lengths of every row that has a
    brief are written to ``./data/len.csv``.

    Args:
        vocabfile: path of the vocabulary file to (over)write.
    """
    mysql = MySQL()
    mysql.login()
    cursor = mysql.get_cursor()

    vocab = defaultdict(int)

    def imdict(ab):
        """Count every space-separated token of *ab* that passes the filter."""
        for a in ab.split(" "):
            a = a.strip()
            # Skip empty tokens, tokens matching `rec` that are entirely
            # lowercase (original note: drop all-lowercase English words),
            # and anything matching `rec0`.
            if len(a) == 0 or (rec.match(a) and a.islower()) or (rec0.match(a)):
                continue
            vocab[a] += 1

    urlset = set()
    dalist = []
    tables = ["news", "crawldata"]
    try:
        for table in tables:
            # Table names come from the fixed list above, not from user input.
            sent = "select title,brief,content,url from %s where 1" % table
            cursor.execute(sent)

            for title, brief, content, url in cursor.fetchall():
                if url in urlset:
                    continue
                urlset.add(url)

                title = Data.extract_html(title, False)
                imdict(title)

                if table == "news" and brief is not None:
                    brief = re.sub("摘要:", "", brief)
                    brief = Data.extract_html(brief, False)
                    imdict(brief)
                    brieflen = len(brief)
                else:
                    brieflen = 0

                content = re.sub("资料图(图源:.*?)", "", content)
                try:
                    content = Data.extract_html(content)
                except Exception:
                    # Best effort: skip rows whose content cannot be cleaned,
                    # but let KeyboardInterrupt/SystemExit propagate.
                    continue
                # NOTE(review): the reason for this per-row pause is not
                # visible here; kept as in the original.
                time.sleep(0.1)
                imdict(content)
                dalist.append([brieflen, len(content)])
    finally:
        # Release the connection even when a query or a row fails.
        mysql.close()

    data = pd.DataFrame(columns=["brief", "content"], data=dalist)
    data = data[data['brief'] > 0]
    data.to_csv("./data/len.csv", index=False)

    newvocab = {Data.UNKNOWN_TOKEN: 0, Data.PAD_TOKEN: -1,
                Data.SENTENCE_START: -1, Data.SENTENCE_END: -1}
    for key, value in vocab.items():
        if value >= 5:
            newvocab[key] = value
        else:
            newvocab[Data.UNKNOWN_TOKEN] += value

    with open(vocabfile, 'w') as f:
        for word, num in newvocab.items():
            f.write(word + " " + str(num) + "\n")
Beispiel #3
0
 def __init__(self):
     """Prepare request headers, an HTTP session, a delay and a DB handle."""
     user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
     # Header set stored for requests against the arxiv.org host.
     self._headers = {
         'User-Agent': user_agent,
         'Sec-Fetch-Mode': 'no-cors',
         'Host': 'arxiv.org',
     }
     self._sess = requests.Session()
     # presumably a delay in seconds between requests -- confirm at use site
     self._sleep_time = 5
     self._mysql = MySQL()
Beispiel #4
0
 def run(self):
     """Dispatch on the parsed arguments.

     With ``filenames`` set, the first two entries are passed to
     ``CSVParser`` and the CSV files are formatted.  Otherwise, with
     ``output`` set, the result of ``SELECT * from foobar`` is written to
     the first output path through a ``MySQL`` handle.

     Raises:
         AssertionError: when neither ``filenames`` nor ``output`` is set.
     """
     args = self.arguments()
     if args.filenames:
         csvobject = CSVParser(args.filenames[0], args.filenames[1])
         csvobject.format_csv_files()
     elif args.output:
         print('We are connecting to the database...')
         db = MySQL()  # Here is where you would pass in a MySQL connection
         db.write_to_file('SELECT * from foobar', args.output[0])
     else:
         # Raised explicitly: an `assert` statement is removed under
         # `python -O`, which would let this case pass silently.
         raise AssertionError("Unhandled")
Beispiel #5
0
 def __init__(self):
     """Read credentials and timeframe from the command line and reset state.

     ``sys.argv[1]``, ``[2]`` and ``[3]`` supply the user, password and
     timeframe; the base URL is the module-level ``url`` with ``important``
     appended.  All collection attributes start out empty.
     """
     argv = sys.argv
     self.user = argv[1]
     self.password = argv[2]
     self.timeframe = argv[3]

     self.base_url = "%simportant" % url
     self.test_page = test_page
     self.test_page_cont = None

     # Per-run accumulators, filled elsewhere.
     self.art_data = dict()
     self.instruments_dict = dict()
     self.trades = dict()
     self.all_trade_ideas = dict()

     self.db = MySQL()
Beispiel #6
0
 def Authentication(self):
     """Check the entered credentials against the configured database.

     Connection settings are read from the ``DATABASE`` section of
     ``config.ini``.  When the second element of the login result is
     empty, an "Access denied !" message is shown in red; otherwise the
     root window is shown and this window is closed.
     """
     core = MainCore()
     config = configparser.ConfigParser()
     config.read(core.resource('config.ini'))

     settings = [config.get('DATABASE', option)
                 for option in ('host', 'user', 'password', 'database')]
     operation = MySQL(*settings)

     result = operation.user_login(self.user_name.text(),
                                   self.password.text())

     matches = list(result[1])
     if not matches:
         self.label_error.setStyleSheet('color: red')
         self.label_error.setText('Access denied !')
         return

     # First entry of the login result; not used further in this method.
     _permission = result[1][0]
     RootWindow.show()
     self.close()
Beispiel #7
0
def ExampleGen(num_epochs=None):
    """Yield ``(title, content, brief)`` examples from the ``news`` table.

    Every epoch re-runs the query for rows with a non-empty brief and
    yields each row after passing ``content`` and ``brief`` through
    ``extract_html``.

    Args:
        num_epochs: number of passes over the table; ``None`` repeats
            indefinitely.

    Yields:
        Tuples ``(title, content, brief)``.
    """
    mysql = MySQL(sqldb="HWCom")
    mysql.login()
    cursor = mysql.get_cursor()
    sent = "select title,brief,content from news where brief !=''"

    epoch = 0
    try:
        while num_epochs is None or epoch < num_epochs:
            cursor.execute(sent)
            for title, brief, content in cursor.fetchall():
                content = extract_html(content)
                brief = extract_html(brief, False)
                yield (title, content, brief)
            epoch += 1
    finally:
        # Runs when the generator is exhausted, closed or garbage-collected,
        # so the connection is no longer left open.
        mysql.close()
Beispiel #8
0
    def create_db(self):
        """Create the database and tables from the entered configuration.

        Every value in ``self.get_config`` must be non-empty; otherwise an
        "Empty field !" message is shown and nothing is created.  On
        success the status label turns green and the save button is
        enabled; if table creation reports failure, "Access denied!" is
        shown in red.
        """
        def field():
            """Return True only when all configured values are non-empty."""
            values = list(self.get_config)
            # Check every value: returning from inside the loop on the
            # first non-empty entry would leave the rest unvalidated.
            if any(value == '' for value in values):
                self.status_label.setStyleSheet("color: red;")
                self.status_label.setText('Empty field !')
                return False
            # An empty configuration is not usable either.
            return bool(values)

        if not field():
            return

        db = MySQL(self.get_config[0], self.get_config[1],
                   self.get_config[2], self.get_config[3])
        db.CreateDatabase()
        if db.CreateTables():
            self.status_label.setStyleSheet("color: green;")
            self.status_label.setText('Success Connect')
            self.save_btn.setEnabled(True)
        else:
            self.status_label.setStyleSheet("color: red;")
            self.status_label.setText("Access denied!")
def main():
    """Collect ladder data for known battletags and store it in the database.

    With no command-line argument the required per-region files are
    verified; with one argument it is read as a file of battletags (one
    per line) that replaces the per-region lists.  For every region the
    ladders are fetched and stored, each ladder's players are compared
    case-insensitively against the battletags, and matching players and
    their races are added to the database.  Finally all valid players are
    written to disk and the database handle is closed.
    """
    # verify that the necessary files exist
    battletag_from_cli = []
    if len(sys.argv) == 1:
        try:
            verify_files_exists(REGION_CODES)
        except FileNotFoundError:
            sys.exit(1)
    elif len(sys.argv) == 2:
        if not os.path.exists(sys.argv[1]):
            Log.write_log_message("Specified file does not exist, exiting...",
                                  True)
            # Actually stop here, as the message says; otherwise open()
            # below fails on the missing file.
            sys.exit(1)
        # Context manager so the battletag file is closed after reading.
        with open(sys.argv[1], "r") as btags:
            battletag_from_cli = [btag.strip() for btag in btags]

    # get the API request parameters
    request_parameters = get_request_parameters()

    # get the current season ID
    season_id = -1
    try:
        season_id = API.get_current_season_id(request_parameters)
    except RequestError as e:
        print(e)
        sys.exit(1)
    Log.write_log_message("Current Season ID: {}".format(season_id))

    db_handle = MySQL()

    for region in REGION_CODES:
        Log.write_log_message("Starting {} Region".format(region.upper()))

        # get ladders
        ladders = API.get_all_ladders(region, MAX_LEAGUE_ID, season_id,
                                      request_parameters)
        Log.write_log_message("Total Ladders Found: {}".format(len(ladders)))

        # add all of the ladders to the database
        try:
            add_ladders_to_database(db_handle, ladders)
        except MySQLdb.IntegrityError:
            Log.write_log_message(
                "Ladders are already in database for {}".format(
                    region.upper()))

        # read in btags to a list
        if len(battletag_from_cli) == 0:
            battletags = get_battletags(region)
        else:
            battletags = battletag_from_cli
        num_battletags = len(battletags)
        Log.write_log_message("Battletags Read In: {}".format(num_battletags))

        # Lower-case the battletags once per region instead of rebuilding
        # the list for every player; a set makes each lookup O(1).
        wanted = {battletag.lower() for battletag in battletags}

        # go through every ladder looking for one of our players
        for ladder in ladders:
            # get all of the players in the ladder
            players = API.get_players_in_ladder(region, ladder,
                                                request_parameters)

            for player in players:
                if player.battletag.lower() not in wanted:
                    continue

                # a JSL contestant was found
                db_handle.add_player(player)

                for team in player.ladders:
                    db_handle.add_race(player, team)

                for team in player.ladders:
                    Log.write_log_message(
                        "Found player: {} [{} {} {}]".format(
                            player.battletag, team.league, team.divison,
                            team.race))

    # get all players in database
    Log.write_log_message("Writing valid player data to disk")
    valid_players = db_handle.get_all_valid_players()
    write_valid_players(valid_players)

    # close database
    db_handle.close()
Beispiel #10
0
def get_product(book, gui):
    """Fetch a WooCommerce product for *book* and flatten it into a dict.

    The product id is looked up through ``MySQL(isbn=book)``; when found,
    the product is fetched and its id, translated attributes, name,
    description, category names, tag names, first image, prices and stock
    quantity are copied into the result.  Each field is copied
    independently: a failure is logged and the remaining fields are still
    attempted.  Finally, every truthy ``*_box`` entry of *gui* gets a key
    in the result, with missing or empty-string values set to ``None``.

    Returns:
        The collected dict; empty when the lookup yields nothing or fails.
    """
    dictionary = {}

    def copy_field(target, source):
        """Copy ``request[source]`` to ``dictionary[target]``, logging errors."""
        try:
            dictionary[target] = request[source]
        except Exception as error:  # pylint: disable=broad-except
            logger.info(error)

    def copy_names(target, source):
        """Store the cleaned ``name`` of every entry under ``request[source]``."""
        try:
            dictionary[target] = [entry["name"].replace("amp;", "")
                                  for entry in request[source]]
        except Exception as error:  # pylint: disable=broad-except
            logger.info(error)

    try:
        # Get product ID from DB.
        mysql_response = MySQL(isbn=book).db_mysql()
        if not mysql_response:
            return dictionary

        product = WooCommerce(book=book)
        request = product.get_woo_product(mysql_response)
        if not request:
            return dictionary

        copy_field("id", "id")

        try:
            for attribute in request["attributes"]:
                options = product.list_expander(attribute["options"]).replace("amp;", "")  # pylint: disable=line-too-long
                dictionary[product.get_translation(attribute["name"],
                                                   "en")] = options
        except Exception as error:  # pylint: disable=broad-except
            logger.info(error)

        copy_field("name", "name")
        copy_field("description", "description")
        copy_names("categories", "categories")
        copy_names("tags", "tags")

        try:
            dictionary["image"] = request["images"][0]["src"]
        except Exception as error:  # pylint: disable=broad-except
            logger.info(error)

        copy_field("price", "regular_price")
        copy_field("sale_price", "sale_price")
        copy_field("amount", "stock_quantity")

        # Make sure every enabled GUI box has an entry, using None for
        # values that are missing or empty strings.
        for key in gui:
            if "_box" not in key or not gui[key]:
                continue
            name = key.split('_box')[0]
            if name not in dictionary or dictionary[name] == "":
                dictionary[name] = None

    except Exception as error:  # pylint: disable=broad-except
        logger.info(error)

    return dictionary
Beispiel #11
0
    def post_woo_products(self):
        '''Post WooCommerce product.

        Builds the product payload from ``self.book`` (mandatory name,
        description, ISBN and six attributes; optional tags, image, prices,
        stock and categories), posts it, and reports the created product.
        When the response carries status 400, the existing product is looked
        up by ISBN, its stock is added to the new amount, and the product is
        updated instead.

        Returns:
            dict with ``id``, ``name``, ``link`` and ``source`` (``False`` for
            a newly created product, ``True`` for an updated one), or ``None``
            when the request failed or no product could be created/updated.
        '''
        # Bound up front so the final return is always defined, even when an
        # exception is logged before any output was built.
        output = None
        try:
            # Auth
            auth = self.get_woo_request()
            # Upload image to media
            image = wp(self.book["image"])

            attribute_fields = (
                ("Tytuł", "title"),  # cspell: disable-line
                ("Autor", "authors"),  # cspell: disable-line
                ("Wydawnictwo", "publisher"),  # cspell: disable-line
                ("Rok wydania", "publish_date"),  # cspell: disable-line
                ("Okładka", "binding"),  # cspell: disable-line
                ("ISBN", "isbn"),
            )
            data = {
                "name": self.book["name"],
                "description": self.book["description"],
                "sku": self.book["isbn"],
                "categories": [],
                "tags": [],
                # Attribute id and position are both the 1-based index.
                "attributes": [
                    {
                        "id": index,
                        "name": label,
                        "position": index,
                        "visible": True,
                        "variation": True,
                        "options": [self.book[book_key]]
                    }
                    for index, (label, book_key) in enumerate(
                        attribute_fields, start=1)
                ]
            }

            # Tags
            try:
                if self.book["tags"]:
                    for tag in self.validate_tags():
                        data["tags"].append({'id': tag})
            except Exception as error:  # pylint: disable=broad-except
                logger.info(error)

            # Image
            if image:
                data["images"] = [{"src": image}]

            # Price and sale price
            for book_key, woo_key in (("price", "regular_price"),
                                      ("sale_price", "sale_price")):
                try:
                    if self.book[book_key]:
                        data[woo_key] = self.book[book_key]
                except Exception as error:  # pylint: disable=broad-except
                    logger.info(error)

            # Amount
            try:
                if self.book["amount"]:
                    data["manage_stock"] = True
                    data["stock_quantity"] = self.book["amount"]
            except Exception as error:  # pylint: disable=broad-except
                logger.info(error)

            # Get category ID
            try:
                for category in self.validate_category():
                    data["categories"].append({'id': category})
            except Exception as error:  # pylint: disable=broad-except
                logger.info(error)

            # Send request
            response = auth.post("products", data).json()

            # Read the error status once; successful responses carry no
            # "data" entry, so the status stays None for them.
            status = None
            if "data" in response:
                status = response.get("data", {}).get("status")
                # Send none if status code found in error codes
                if status in self.error_codes:
                    self.error_catch.append(inspect.getouterframes(inspect.currentframe())[0].function)  # pylint: disable=line-too-long
                    return None

            # Format output
            try:
                output = {
                    'id': response["id"],
                    'name': response["name"],
                    'link': response["permalink"],
                    'source': False
                }
            except Exception as error:  # pylint: disable=broad-except
                logger.info(error)

            if status == 400:
                # Look the existing product up by ISBN and update it.
                request = None
                try:
                    mysql_request = MySQL(isbn=self.book["isbn"])
                    request = mysql_request.db_mysql()
                except Exception as error:  # pylint: disable=broad-except
                    logger.info(error)

                if not request:
                    return None

                product = self.get_woo_product(request)
                if product["stock_quantity"]:
                    data["stock_quantity"] = int(data["stock_quantity"]) + product["stock_quantity"]  # pylint: disable=line-too-long

                try:
                    response = self.update_woo_products(
                        product["id"], data)
                    output = {
                        'id': response["id"],
                        'name': response["name"],
                        'link': response["permalink"],
                        'source': True
                    }
                    return output
                except Exception as error:  # pylint: disable=broad-except
                    logger.info(error)

        except Exception as error:  # pylint: disable=broad-except
            logger.info(error)
        return output