Example #1
0
    def __init__(self):
        """Bind the crawler to its MongoDB database and reset the URL."""
        # Debug runs go to a separate test database so live data stays safe.
        db_name = 'test_stock_crawler' if crawler_config.debug else 'stock_crawler'
        self.db = MongoDB(db_name)

        self.url = ""
Example #2
0
 def __init__(self):
     """Set up the MongoDB handle, the crawl work queues and the logger."""
     self.mongo = MongoDB()
     self.news_url_queue = news_url_Queue()  # holds news URLs for multi-threaded crawling
     self.news_html_queue = news_url_Queue()  # holds fetched news HTML pages
     self.old_day_news_queue = news_url_Queue()  # presumably holds URLs from previous days -- confirm against usage
     # self.log = Logging('../helloword/static/sina').get_logging()
     self.log = Logging('../Sina/sina.txt').get_logging()
Example #3
0
 def read_data_from_coupon_db(self, id=None):
     flag = 0
     print "id = ",id
     if not id:
         return MongoDB().read_from_coupon_collection()
     else:
         for data_entry in MongoDB().read_from_coupon_collection():
             if id in data_entry.values():
                 print "data_entry = ", data_entry
                 reqd_data = data_entry
                 flag = 1
         if flag == 1:
             del reqd_data['_id']
             return reqd_data
         else:
             return 'Data not available'
    def crawl_zhilian(self, city, keyword):
        """Crawl Zhilian (zhaopin.com) listings for *keyword* in *city*.

        Builds the list of detail-page URLs, then fetches and stores each
        detail page into the keyword's MongoDB collection.
        """
        # TODO: the URL list could be turned into a stack.
        # Paged JSON search endpoint, 90 results per page.
        list_url = 'https://fe-api.zhaopin.com/c/i/sou?start={page}&pageSize=90&cityId={city}&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw={keyword}&kt=3'
        store = MongoDB('zhilian', self.keywords[keyword])

        detail_urls = self._get_list(list_url, city, keyword, page_weight=90)

        print(keyword, city, 'list parser done!')
        print(len(detail_urls))

        self._get_content(store, detail_urls)
    def crawl_qiancheng(self, city, keyword):
        """Crawl 51job (qiancheng) listings for *keyword* in *city*.

        Collects detail-page URLs from the paged HTML search results and
        stores the parsed content in the keyword's MongoDB collection.
        """
        list_url = 'https://search.51job.com/list/{city},000000,0000,00,9,99,{keyword},2,{page}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
        store = MongoDB('qiancheng', self.keywords[keyword])

        detail_urls = self._get_list(
            list_url, city, keyword, page_begin=1, web_name='qiancheng')

        print(keyword, city, 'list parser done!')
        if detail_urls:
            print(len(detail_urls))

        self._get_content(store, detail_urls, web_name='qiancheng')
    def crawl_liepin(self, city, keyword):
        """Crawl Liepin listings for *keyword* in *city*.

        Collects detail-page URLs (40 per page) and stores the parsed
        content in the keyword's MongoDB collection.
        """
        list_url = "https://www.liepin.com/city-{city}/zhaopin/pn{page}/?d_pageSize=40&jobKind=2&key={keyword}"
        store = MongoDB('liepin', self.keywords[keyword])

        detail_urls = self._get_list(
            list_url, city, keyword, page_begin=0, web_name='liepin')

        print(keyword, city, 'list parser done!')
        if detail_urls:
            print(len(detail_urls))

        self._get_content(store, detail_urls, web_name='liepin')
Example #7
0
def main():
    """Run one reserve-logging coroutine per pair (first 100 pairs)."""
    db = MongoDB()
    pair_addresses = db.get_all_pairs()

    # One polling coroutine per pair, 60-second interval, capped at 100.
    tasks = [log_loop(Web3.toChecksumAddress(addr), 60)
             for addr in pair_addresses[:100]]
    print('{} Starting...'.format(len(tasks)))

    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(asyncio.gather(*tasks))
    finally:
        loop.close()
Example #8
0
    def get_all_status_thread(self,
                              status_list=None,
                              collect_name='status',
                              trim_user=True,
                              include_entities=True):
        """Fetch each status id in *status_list* and insert it into MongoDB.

        The list is consumed destructively via pop(0) -- presumably so
        several worker threads sharing one list never process the same id
        twice (confirm against the caller).  Statuses that cannot be
        fetched or converted are skipped silently; insert errors
        (typically duplicate keys) are skipped as well.

        NOTE: trim_user / include_entities are accepted but unused here,
        kept for interface compatibility.
        """
        # BUG FIX: the original default was a mutable `[]`, shared across
        # every call of this method.  `None` + local creation is safe and
        # behaves identically for callers that passed nothing.
        if status_list is None:
            status_list = []

        wrapper_func = handle_exception(self.get_status)

        db = MongoDB().connect()
        collect = db[collect_name]

        while len(status_list) > 0:
            status_id = status_list.pop(0)
            status_obj = wrapper_func(status_id)

            status = self.tweetobj_to_dict(status_obj)

            if not status:
                continue

            try:
                collect.insert_one(status)
            except Exception:
                # Best-effort insert; usually a duplicate _id.
                continue
Example #9
0
def getCookies(weibo):
    """For each account dict in *weibo* (keys 'no' = login, 'psw' =
    password), log in to mobile Weibo with Selenium, harvest the session
    cookies, cache them in the MongoDB ``Cookies`` collection and append
    them to the local ``cookies`` list.
    """
    db = MongoDB()
    cookies = []
    loginURL = 'https://passport.weibo.cn/signin/login'

    # Only refill the pool when fewer than 10 cookies are cached.
    if db.Cookies.count() < 10:

        print '-----------------------------------------'
        print 'Start crawl cookies'
        print '-----------------------------------------'

        for elem in weibo:
            account = elem['no']
            password = elem['psw']
            item = {'account':account}
            # Skip accounts whose cookie is already stored.
            if db.find_cookie(item):
                continue
            try:
                driver = webdriver.Chrome()
                driver.get(loginURL)
                time.sleep(2)

                # Retry the login form while the page title still reads
                # "Login - Sina Weibo", at most 5 attempts.
                failure = 0
                while "登录 - 新浪微博" in driver.title and failure < 5:
                    failure += 1
                    driver.set_window_size(1920, 1080)
                    username = driver.find_element_by_id("loginName")
                    username.clear()
                    username.send_keys(account)

                    psd = driver.find_element_by_id("loginPassword")
                    psd.clear()
                    psd.send_keys(password)

                    commit = driver.find_element_by_id("loginAction")
                    commit.click()
                    time.sleep(10)

                # cookie=driver.get_cookies()
                # print cookie
                cookie = {}
                # Success: the title switched to the Weibo home page
                # ("Weibo - discover what's new, anytime, anywhere").
                if "微博 - 随时随地发现新鲜事" in driver.title:
                    for elem in driver.get_cookies():
                        cookie[elem["name"]] = elem["value"]
                    if len(cookie) > 0:
                        item = {'account': account,
                                'password': password, 'cookie': cookie}
                        db.Cookies.insert_one(item)
                        cookies.append(cookie)
                        print "*******************************"
                        print "Get Cookie Successful: %s!!!!!!" % account
                        print "*******************************"
                        continue
                print "*******************************"
                print "Get Cookie Failed: %s!" % account
                print "*******************************"

            except Exception, e:
                print "*******************************"
                print "%s Failure!!!!!" % account
                print e
                print "*******************************"

            finally:
Example #10
0
        decimals = 18
    except Exception as e:
        traceback.print_exc()
        sys.exit(-1)
    print('{} {}({}) {}'.format(token, name, symbol, decimals))
    doc = {
        'address': token,
        'name': name,
        'symbol': symbol,
        'decimals': decimals,
    }
    return doc


if __name__ == '__main__':
    db = MongoDB()
    # Only tokens that are not yet stored locally still need parsing.
    parse_tokens = [token for token in db.get_all_tokens()
                    if db.get_token(token) is None]
    print(len(parse_tokens))

    # Fan the parsing out over a thread pool and drain the results.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(parse_token, token)
                   for token in parse_tokens]
        for future in concurrent.futures.as_completed(futures):
            doc = future.result()
Example #11
0
    def get_user_all_timeline(self,
                              user_id=None,
                              collect_name="tweets_task",
                              screen_name=None,
                              include_rts=True,
                              exclude_replies=False):

        if user_id == None and screen_name == None:
            return None

        if user_id:
            try:
                user_id = long(user_id)
            except Exception as e:
                print e
                return None

        flag = True
        tweets = [0]
        sleep_count = 0

        db = MongoDB().connect()
        collect = db[collect_name]
        get_api = self.get_api

        while len(tweets) > 0:
            try:
                if flag:
                    tweets = get_api().GetUserTimeline(
                        user_id=user_id,
                        screen_name=screen_name,
                        include_rts=include_rts,
                        exclude_replies=exclude_replies,
                        trim_user=True,
                        count=200)
                    flag = False

                else:
                    tweets = get_api().GetUserTimeline(
                        user_id=user_id,
                        screen_name=screen_name,
                        include_rts=include_rts,
                        exclude_replies=exclude_replies,
                        trim_user=True,
                        count=200,
                        max_id=tweets[-1].id - 1)

            except error.TwitterError as te:
                try:
                    if te.message == 'Not authorized.':
                        print 'Not authorized.'
                        return

                    if te.message[0]['code'] == 88:
                        sleep_count += 1

                        if sleep_count >= API_COUNT:
                            print "sleeping..."
                            sleep_count = 0
                            time.sleep(300)
                        continue

                    else:
                        print te
                        break
                except Exception as ee:
                    print ee
                    break
            except Exception as e:
                break

            for tt in tweets:
                tweet = self.tweetobj_to_dict(tt)

                if not tweet:
                    continue

                try:
                    collect.insert_one(tweet)
                except Exception as e:
                    continue
        logger.info(f'Converting: {src_collection.__name__} ({db.get_collection_size(src_collection)})'
                    f' -> {dst_collection.__name__} ({db.get_collection_size(dst_collection)})')

    docs = src_collection.objects()
    total_count = docs.count()
    for current_count, src_doc in enumerate(docs):
        log_progress(current_count, total_count)

        try:
            mapped_doc = map_document(src_doc)
        except (DocumentConversionError, DocumentConstructionError) as e:
            logger.warning(f'Skipping: {src_doc} because of: {e}')
            continue

        mapped_doc.create_or_update()

    with db.connect():
        logger.info(f'Total {dst_collection.__name__} count: ({db.get_collection_size(dst_collection)})')
        logger.info(f'Documents.Conclusion count: ({db.get_collection_size(Documents.Conclusion)})')
        logger.info(f'Documents.License count: ({db.get_collection_size(Documents.License)})')


if __name__ == '__main__':
    logger = root_logger('convert_data', logging.INFO)
    try:
        db = MongoDB()  # credentials for MongoDB can be set up here
        convert_data(Documents.FileRaw, Documents.File)
        logger.info('Success')
    except Exception as e:
        # BUG FIX: failures were logged at INFO level; log them as errors
        # with the traceback (logger.exception implies exc_info=True).
        logger.exception(e)
Example #13
0
    async def get_and_store(self, device):
        """Poll *device* over SNMP and persist the snapshot to MongoDB.

        Concurrently gathers system info, routes, IP addresses, interfaces
        and CDP neighbors, derives per-interface bandwidth usage from the
        previously stored sample, then upserts everything.
        """
        mongo = MongoDB()

        host = device.ip
        community = device.snmp_community
        port = device.snmp_port

        results = await asyncio.gather(
            asyncio.ensure_future(get_system_info(host, community, port)),
            asyncio.ensure_future(get_routes(host, community, port)),
            asyncio.ensure_future(get_ip_addr(host, community, port)),
            asyncio.ensure_future(get_interfaces(host, community, port)),
            asyncio.ensure_future(get_cdp(host, community, port)),
            # asyncio.ensure_future(get_lldp(host, community, port)), # Todo
        )

        if all(r is None for r in results):
            logging.debug("SNMP Server for device ip %s is gone down", host)
            return

        system_info, routes, ip_addrs, interfaces, cdp = results
        # lldp = results[5]  # Todo

        # Attach the matching address/subnet to each interface; iterate the
        # dicts directly instead of range(len()) index arithmetic.
        for interface in interfaces:
            for ip_addr in ip_addrs:
                if interface['index'] == ip_addr['if_index']:
                    interface['ipv4_address'] = ip_addr['ipv4_address']
                    interface['subnet'] = ip_addr['subnet']
                    break

        my_device = mongo.db.device.find_one({'device_ip': host})

        if my_device:
            # Uptime elapsed since the previous sample; identical for the
            # in- and out-direction calculations, so compute it once.
            delta_time = system_info['uptime'] - my_device['uptime']
            for interface in interfaces:
                for my_interface in my_device['interfaces']:
                    if interface['description'] != my_interface['description']:
                        continue
                    # In -- BUG FIX: the original subtracted 'out_octets'
                    # here, so inbound usage mirrored outbound usage.
                    # Assumes the interface dicts carry an 'in_octets'
                    # counter -- confirm against get_interfaces().
                    in_octets = interface['in_octets'] - my_interface[
                        'in_octets']
                    bw_in_usage_percent = sdn_utils.cal_bw_usage_percent(
                        in_octets, interface['speed'], delta_time)
                    # Out
                    out_octets = interface['out_octets'] - my_interface[
                        'out_octets']
                    bw_out_usage_percent = sdn_utils.cal_bw_usage_percent(
                        out_octets, interface['speed'], delta_time)

                    # Record usage on the interface document.
                    interface['bw_in_usage_octets'] = in_octets
                    interface['bw_in_usage_percent'] = bw_in_usage_percent
                    interface['bw_out_usage_octets'] = out_octets
                    interface['bw_out_usage_percent'] = bw_out_usage_percent
                    interface['bw_usage_update'] = time.time()

                    logging.debug(' || BW in usage %.3f || %d bytes',
                                  bw_in_usage_percent, in_octets)
                    logging.debug(' || BW out usage %.3f || %d bytes',
                                  bw_out_usage_percent, out_octets)
                    break

        system_info['interfaces'] = interfaces

        # Replace stored routes wholesale.  insert_many raises
        # InvalidOperation on an empty list, so guard it.
        mongo.db.route.delete_many({'device_ip': host})
        if routes:
            mongo.db.route.insert_many(routes)

        # NOTE(review): every other collection is accessed via `mongo.db.*`;
        # confirm `mongo.device` is not meant to be `mongo.db.device`.
        mongo.device.update_one({'ipv4_address': host}, {'$set': system_info},
                                upsert=True)

        # Upsert the CDP neighbor table.
        mongo.db.cdp.update_one({'device_ip': host},
                                {'$set': {
                                    'device_ip': host,
                                    'neighbor': cdp
                                }},
                                upsert=True)
Example #14
0
 def drop_coupon_table(self):
     """Drop the coupon collection from MongoDB."""
     db = MongoDB()
     db.drop_coupon_collection()
Example #15
0
 def write_data_to_coupon_db(self, info):
     """Persist *info* into the coupon collection."""
     db = MongoDB()
     db.write_to_coupon_collection(info)
Example #16
0
 def drop_product_table(self):
     """Drop the product collection from MongoDB."""
     db = MongoDB()
     db.drop_product_collection()
Example #17
0
 def write_data_to_product_db(self, info):
     """Persist *info* into the product collection."""
     db = MongoDB()
     db.write_to_product_collection(info)
Example #18
0
def run_server():
    """Start the web server against a fresh MongoDB handle and clear the
    database when the server stops.
    """
    global db
    db = MongoDB()
    # BUG FIX: clear_db() was skipped whenever run() raised; run the
    # cleanup in a finally block so it always executes.
    try:
        run(host=host, port=port)
    finally:
        db.clear_db()
    # data['临床表现'] = data['临床表现'].str.replace(r'等.*?(?:\s|$)|[,。,.、;;]', ' ')
    # data['临床表现'] = data['临床表现'].str.replace(r'或|常伴|伴有?|发生|[轻甚]则|甚至', '')
    # data['临床表现'] = data['临床表现'].str.replace(r'[^\s]{9,}', ' ')
    # data['临床表现'] = data['临床表现'].str.replace(r'\s+', ' ')
    data['临床表现'] = data['临床表现'].str.strip()


    data.drop_duplicates('病证', 'last', inplace=True)
    # data = data['临床表现'].str.split().tolist()
    # data = [j for i in data for j in i]
    # counter = Counter(data)
    
    # print(data['临床表现'])
    # data.to_excel('bz2.xls', index=False)
    # bz = pd.read_excel('bz.xls')[['病症', '临床表现']]
    mongo = MongoDB()
    food_info = mongo.find_all('diet_merge', projection={'name': 1, 'ingredients': 1, 'syndrome': 1})

    food_info_df = pd.DataFrame(data=[[f['name'], f['ingredients'], f['syndrome']] for f in food_info], columns=['食疗方', '食材', '主治'])
    
    food_info_df['存在关联'] = 0
    food_info_df['主治'] = food_info_df['主治'].str.replace('证', '').str.replace('型', '')
    food_info_df.loc[food_info_df['主治'] != '', '主治'] = food_info_df['主治'] + '证'
    food_bz = [item['syndrome'] for item in food_info]
    food_bz = list(filter(None, food_bz))
    print('Food(Total): {}'.format(len(food_bz)))
    food_bz = set([item.replace('证', '').replace('型', '') for item in food_bz])
    results = []
    n_valid_bz = 0
    valid_bz_set = set()
    for item in food_bz:
Example #20
0
# influx.py
import time
from datetime import datetime
import concurrent

from helper import DeFiContract
from web3 import Web3
from web3 import exceptions
from kfk import KafkaDB
from database import MongoDB

k = KafkaDB()
m = MongoDB()


def scrapReserves(pair_address):
    """Read the current reserves of a pair contract and return a document
    ready for storage (address, r0, r1, UTC timestamp).
    """
    checksummed = Web3.toChecksumAddress(pair_address)
    contract = DeFiContract(checksummed, 'Pair')
    r0, r1, _ = contract.getReserves()

    print('{} {} {}'.format(checksummed, r0, r1))
    return {
        'address': checksummed,
        'r0': r0,
        'r1': r1,
        't': datetime.utcnow().timestamp(),
    }


while True:
Example #21
0
# -*- coding: utf-8 -*-

from database import MongoDB
import check, os
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters

# SECURITY(review): the bot token is hard-coded in source -- rotate it and
# load it from an environment variable or config file instead.
Token = "376593798:AAHMNABESGpXiFGiQ8Bg-0CnHc2EwyXD1hk"

updater = Updater(token=Token)

dispatcher = updater.dispatcher

mongodb = MongoDB()

# Presumably Telegram usernames with elevated / normal access -- confirm
# against the handlers that consume these lists.
admins = ["utkucanbykl", "vlademir92", "badgeekluck"]

users = ["utkucanbykl", "vlademir92", "badgeekluck"]


def start(bot, update):
    """Handle /start: confirm the bot is running (message is in Turkish)."""
    chat_id = update.message.chat_id
    bot.sendMessage(chat_id=chat_id, text="Bot çalışıyor.")


def hello(bot, update):
    """Greet the sender by first name."""
    greeting = "Hello " + update.message.from_user.first_name
    bot.sendMessage(chat_id=update.message.chat_id, text=greeting)


def echo(bot, update):
Example #22
0
from core import PastePwn
from scraping.pastebin import PastebinScraper
from database import MongoDB

# Write logs next to this file under ./logs, creating the directory if needed.
logdir_path = os.path.dirname(os.path.abspath(__file__))
logfile_path = os.path.join(logdir_path, "logs", "pastepwn.log")

if not os.path.exists(os.path.join(logdir_path, "logs")):
    os.makedirs(os.path.join(logdir_path, "logs"))

# WatchedFileHandler re-opens the file if an external tool rotates it.
logfile_handler = logging.handlers.WatchedFileHandler(logfile_path, "a", "utf-8")

logger = logging.getLogger(__name__)
logging.basicConfig(format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.DEBUG, handlers=[logfile_handler, logging.StreamHandler()])

# Framework code
database = MongoDB(ip="192.168.240.128")

pastepwn = PastePwn(database)
pastepwn.add_scraper(PastebinScraper())

# NOTE(review): placeholder credentials -- load the real token/receiver from
# configuration before deploying.
telegram_action = TelegramAction(token="token", receiver="-1001348376474")

mail_analyzer = MailAnalyzer(telegram_action)
premium_analyzer = WordAnalyzer(telegram_action, "premium")

pastepwn.add_analyzer(mail_analyzer)
pastepwn.add_analyzer(premium_analyzer)

# Blocks and runs the scraping/analysis loop.
pastepwn.start()