Example #1
0
File: api.py Project: soltysh/blast
 def __init__(self):
     if 'TEXT_DB_SERVICE_HOST' in os.environ:
         self.db = Mongo(os.getenv('MONGODB_USER'), \
             os.getenv('MONGODB_PASSWORD'), \
             os.getenv('TEXT_DB_SERVICE_HOST'), \
             os.getenv('TEXT_DB_SERVICE_PORT'))
     else:
         self.db = Mongo('user', 'password', 'localhost', '27017')
Example #2
0
 def __init__(self):
     if 'DATABASE_SERVICE_HOST' in os.environ:
         self.db = Mongo(os.getenv('MONGODB_USER'), \
             os.getenv('MONGODB_PASSWORD'), \
             os.getenv('DATABASE_SERVICE_HOST'), \
             os.getenv('DATABASE_SERVICE_PORT'),
             os.getenv('MONGODB_DATABASE'))
     else:
         self.db = Mongo('user', 'password', 'localhost', '27017',
                         'catcatgo_db')
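The `db.Mongo` wrapper constructed in Examples #1 and #2 is not part of this listing, and its signature varies between projects (four positional arguments in #1, five in #2). A minimal, hypothetical sketch of such a wrapper on top of pymongo, with the class internals and the default database name assumed, could look like this:

from pymongo import MongoClient

class Mongo:
    '''Hypothetical stand-in for the db.Mongo wrapper used in the snippets above.'''
    def __init__(self, user, password, host, port, database='test'):
        # environment variables arrive as strings, so coerce the port
        self.client = MongoClient(host=host, port=int(port),
                                  username=user, password=password)
        self.database = self.client[database]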
Example #3
0
def create_graph():
    '''
    create a vtuber relation graph
    '''
    PAGE_SIZE = 20
    vgraph = networkx.Graph()
    client = Mongo(CONFIG["mongo"]["addr"], 'youtube')
    vtubers = client.loadWholeDoc('vtuber')
    video_num = client.loadWholeDoc('videosv2').count()
    for i in range(0, video_num, PAGE_SIZE):
        v_page = client.loadWholeDoc('videosv2').skip(i).limit(PAGE_SIZE)
        for v in v_page:
            owner = v['channelId']
            try:
                desc = v['description']
            except KeyError:
                # sometimes 'description' is missing from the video document
                continue
            extract_relation(owner, desc, vgraph)
    # preview
    # networkx.draw(vgraph, with_labels=True, font_weight='bold')
    # plt.show()
    # map channel ids to names using the vtuber list
    name_dict = {}
    vtb_dict = {}
    for v in vtubers:
        name = v['channel']
        try:
            channel_id = v['channel_url'].split('/')[-1]
        except:
            continue
        name_dict[channel_id] = name
        vtb_dict[channel_id] = v

    # update the position and size of each node in the graph
    pos = networkx.random_layout(vgraph)

    # remove all nodes that are not in the vtuber list
    remove_list = []
    for _n in vgraph.nodes():
        if _n not in name_dict:
            remove_list.append(_n)
    vgraph.remove_nodes_from(remove_list)

    for _n in vgraph.nodes():
        vgraph.node[_n]['viz']['position'] = {
            'x': pos[_n][0] * (-100) * 1.5,
            'y': pos[_n][1] * 100,
            'z': 0
        }
        # print(math.log2(vtb_dict[_n]['regsit']))
        #vgraph.node[_n]['viz']['size'] = vgraph.degree[_n]
        vgraph.node[_n]['viz']['size'] = (math.log2(vtb_dict[_n]['regsit']) -
                                          10) * 10
    vgraph = networkx.relabel_nodes(vgraph, name_dict)
    # networkx.write_gexf(vgraph, '../data/vtb.gexf')
    return vgraph
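Example #3 relies on the pre-2.x NetworkX accessor `vgraph.node[_n]`, which newer releases (2.4+) removed in favour of `G.nodes[n]`. An untested adaptation of the attribute-update loop for current NetworkX, assuming each node already carries a 'viz' dict, would be:

# NetworkX >= 2.0: per-node attribute dicts are reached via G.nodes[n]
for _n in vgraph.nodes():
    vgraph.nodes[_n]['viz']['position'] = {
        'x': pos[_n][0] * (-100) * 1.5,
        'y': pos[_n][1] * 100,
        'z': 0
    }
    vgraph.nodes[_n]['viz']['size'] = (math.log2(vtb_dict[_n]['regsit']) - 10) * 10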
Example #4
0
def main():
    fname = config.log_path + 'article_parse.' + time.strftime("%Y%m%d")
    log.set_logger(level='DEBUG', when="D", limit=1, filename=fname)
    alist = Mongo().scan()
    if not alist:
        log.warn("no articles in mongodb")
        return False

    MyObj = Mysql()

    mobj = Mongo()
    for doc in alist:
        if Parse(MyObj).do(doc):
            mobj.update(doc.get("_id"), done=1)
            log.info("insert mysql success, url:%s" % doc.get('url'))
        else:
            mobj.update(doc.get("_id"), done=-1)
            log.warning("insert mysql failure, task_id:%s, url:%s" %
                        (doc.get('taskid'), doc.get('url')))
Example #5
0
def compute_ratings_matrix(ratings_matrix_file):
    """
    Computes the ratings matrix.
        Input:
            ratings_matrix_file: filename of the output ratings matrix
    """

    mongo = Mongo('Acme-Supermarket')
    mongo.connect()

    matrix_file = ratings_matrix_file
    hdf5_matrix = tables.openFile(matrix_file, mode='w')

    filters = tables.Filters(complevel=5, complib='blosc')

    products = mongo.database.products.find({}, {'_id': 1})
    products = [p['_id'] for p in products]
    products = numpy.concatenate((numpy.array([-1]), products))
    products_count = mongo.database.products.count()

    customers = mongo.database.actors.find({'_type': 'Customer'}, {'_id': 1})
    customers = [c['_id'] for c in customers]
    customers_count = mongo.database.actors.count({'_type': 'Customer'})

    data_storage = hdf5_matrix.createEArray(hdf5_matrix.root,
                                            'data',
                                            tables.UInt32Atom(),
                                            shape=(0, products_count + 1),
                                            filters=filters,
                                            expectedrows=customers_count)

    data_storage.append(products[:][None])
    for customer_id in customers:
        # Column 0: the customer ID
        # Columns 1+: the product ratings
        row = numpy.zeros((products_count + 1, ))

        row[0] = customer_id
        ratings = mongo.database.rates.find({'customer_id': customer_id}, {
            'product_id': 1,
            'value': 1
        })

        for rating in ratings:
            row[numpy.where(
                products == rating['product_id'])[0][0]] = rating['value']

        data_storage.append(row[:][None])

    hdf5_matrix.close()
    mongo.disconnect()

    return matrix_file
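Example #5 uses the old camelCase PyTables names (`openFile`, `createEArray`), which PyTables 3.x replaced with PEP 8 spellings. The equivalent calls on a current PyTables, sketched with the same variables as above, would be roughly:

# PyTables 3.x spellings of the two calls used above (sketch, not from the source project)
hdf5_matrix = tables.open_file(matrix_file, mode='w')
data_storage = hdf5_matrix.create_earray(hdf5_matrix.root, 'data',
                                         tables.UInt32Atom(),
                                         shape=(0, products_count + 1),
                                         filters=filters,
                                         expectedrows=customers_count)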
Example #6
0
def gen_mock_data():
    '''
    generate fake data for testing
    '''
    client = Mongo(CONFIG["mongo"]["addr"], 'youtube')
    data = client.loadWholeDoc('vtuber')
    graph = networkx.Graph()
    vtubers = []
    for i in range(10):
        vtubers.append(data[i]['channel'])
        graph.add_node(data[i]['channel'], viz={}, mod=1, id=0)
    pos = networkx.random_layout(graph)
    counter = 0
    for v in vtubers:
        graph.node[v]['id'] = counter
        graph.node[v]['viz']['size'] = 20
        graph.node[v]['viz']['position'] = {
            'x': pos[v][0] * (-100),
            'y': pos[v][1] * 100,
            'z': 0
        }
        graph.node[v]["viz"]['color'] = {'r': 255, 'g': 192, 'b': 201, 'a': 1}
        print(graph.node[v])
        counter = counter + 1

    graph.add_weighted_edges_from([(vtubers[1], vtubers[5], 3),
                                   (vtubers[2], vtubers[4], 2),
                                   (vtubers[1], vtubers[3], 3),
                                   (vtubers[1], vtubers[7], 1),
                                   (vtubers[5], vtubers[9], 3),
                                   (vtubers[4], vtubers[8], 3),
                                   (vtubers[1], vtubers[8], 3),
                                   (vtubers[1], vtubers[2], 3),
                                   (vtubers[6], vtubers[7], 2)])
    #graph = networkx.generate_gexf(graph)
    # regen id
    networkx.write_gexf(graph, '../data/mock.gexf')
Example #7
0
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from google_auth_oauthlib.flow import InstalledAppFlow

from db import Mongo, UseCache
from util import load_config

CONFIG = load_config()

SCOPES = ['https://www.googleapis.com/auth/youtube.force-ssl']
API_SERVICE_NAME = 'youtube'
API_VERSION = 'v3'

CLIENT_SECRETS_FILE = "client_secret_974874009372-e5f329adu30h1j8qmtc75r5vvgdngvbr.apps.googleusercontent.com.json"
DATABASE = Mongo(CONFIG["mongo"]["addr"], 'youtube')


class YouTube:
    '''
    this is the manager entity for YouTube API operations
    '''
    _db = None

    def __init__(self, secret):
        # get OAuth2 credential info
        # self._db = database
        flow = InstalledAppFlow.from_client_secrets_file(secret, SCOPES)
        credentials = flow.run_local_server()
        self.service = build(API_SERVICE_NAME,
                             API_VERSION,
Example #8
0
from translator import Translator
from scraper import Scraper
from selenium import webdriver
from db import Mongo
import datetime
import time
import requests
import subprocess
        
if __name__ == "__main__":
    driver = webdriver.Firefox()
    scraper = Scraper(driver)
    translator = Translator(driver)

    assets = ['itau', 'ambev', 'petrobras']
    db = Mongo(assets)

    today = datetime.datetime(2019, 4, 9)  # Change the time window
    end_date = datetime.datetime(2020, 1, 1)
    total_days = (end_date - today).days

    for i in range(total_days):
        days_left = (end_date - today).days
        progress = ((i + 1) / (total_days * 1.0)) * 100

        print(f"Today is {today.strftime('%d/%m/%Y')}. There are {days_left} days left")
        print(f"Progress: {progress}%")

        for asset in assets:
            data = scraper.scrape_requests(asset, today)
Example #9
0
'''
this script is used to collect data from https://mamedaifuku.sakura.ne.jp/

'''
import re
from urllib import request
import json
import bs4
from db import Mongo, UseCache
from util import load_config

CONFIG = load_config()

COMMENT_URL_BASE = 'https://mamedaifuku.sakura.ne.jp/live_stream/php/ex_disp_message.php?v={}&disp_message_info_mode=2&disp_message_author_mode=1&disp_message_comment_mode=1&ym=&turning_page_mode=true&message_page=1'

COMMENT_FANS = "commFans"
database = Mongo(CONFIG["mongo"]["addr"], 'youtube')


def get_video_list():
    '''
    fetch all videos from all users
    '''
    with request.urlopen(
            'https://mamedaifuku.sakura.ne.jp/live_stream/php/ex_return_video_list_json.php?get_video_list_mode=all'
    ) as res:
        # with open('../data/video.json', 'w+') as tmp:
        # tmp.write(res.read().decode())
        # database = Mongo(CONFIG["mongo"]["addr"], 'youtube')
        data = res.read().decode()
        # print(data)
        database.saveBulkToDoc('videosv2', json.loads(data))
Example #10
0
 def __init__(self):
     self.crawler = Crawler()
     self.db = Mongo()
Example #11
0
import re
from flask import Blueprint, jsonify, request
from config.config import Config
from db import Mongo
from app.paginate import Paginate

bp_rent = Blueprint('rent', __name__, url_prefix='/')

mongo = Mongo('192.168.99.100', 27017, 'rent591', 'houses').client()


@bp_rent.route('/' + Config.API_BASE_PATH + '/ans1', methods=['GET'])
def api_ans1():
    page = request.args.get('page', 1)
    gender = request.args.get('gender', '男生')
    region = request.args.get('region', '3')

    gender_limit = '男生' if gender == '女生' else '女生'

    query = {
        'gender_limit': {
            '$ne': gender_limit
        },
        'region': int(region),
    }
    result = mongo.find(query, {"_id": 0})

    p = Paginate(result, current_page=int(page))

    response = {
        'current_page': page,
Example #12
0
class HouseRentCrawler:
    """
    Crawl all house data for the regions listed in "TARGET_REGIONS"
    """

    TARGET_REGIONS = {
        '台北市': 1,
        '新北市': 3,
    }

    HOUSE_ID_QUEUE = Queue()
    HOUSE_DETAIL_QUEUE = multiprocessing.Queue()

    MONGO_DB = Mongo('192.168.99.100', 27017, 'rent591', 'houses')
    ES = Elastic('192.168.99.100', 9200, 'rent591', 'houses')

    def __init__(self, async_tasks_cnt=2):
        self.target_endpoint = 'https://rent.591.com.tw/'
        self.async_tasks_cnt = async_tasks_cnt

    def start(self):
        threading.Thread(target=self._start_get_house_ids).start()
        threading.Thread(target=self._start_get_house_detail).start()

        pool = multiprocessing.Pool(processes=3)
        while True:
            try:
                region_id, house_detail_info = self.HOUSE_DETAIL_QUEUE.get(block=True, timeout=60)

                pool.apply_async(self.save_house_data, args=(region_id, house_detail_info))
            except Empty:
                break
            except Exception:
                continue

    def _start_get_house_ids(self):
        tasks = []

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        for k in self.TARGET_REGIONS:
            tasks.append(
                asyncio.ensure_future(
                    self._get_house_ids(self.TARGET_REGIONS[k])
                )
            )

        loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()

    def _start_get_house_detail(self):
        tasks = []

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        for i in range(self.async_tasks_cnt):
            tasks.append(self._get_house_detail())

        loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()

    async def _get_house_ids(self, region_id):
        """
        Get "totalRows" at first cycle (without "firstRow" & "totalRows" in params),
        then get all data until "firstRow" grater than "totalRows"
        """
        prd = self._get_pre_request_data(region_id)

        cookies = prd['cookies']
        headers = {'X-CSRF-TOKEN': prd['csrf_token']}
        params = {'kind': 0, 'region': region_id, 'type': 1, 'searchtype': 1}

        async with aiohttp.ClientSession(cookies=cookies, headers=headers) as session:
            while (
                params.get('firstRow', 0) < int(params.get('totalRows', '0'))
                or not params.get('firstRow')
            ):
                try:
                    async with session.get(f'{self.target_endpoint}home/search/rsList', params=params) as response:
                        rslist = await response.json()

                        house_ids = [self._get_house_id(data) for data in rslist.get('data').get('data')]

                        for house_id in house_ids:
                            self.HOUSE_ID_QUEUE.put((region_id, house_id))

                        if params.get('firstRow'):
                            params['firstRow'] += 30
                        else:
                            params.update({
                                'firstRow': 30,
                                'totalRows': rslist.get('records', '0').replace(',', ''),
                            })
                except Exception:
                    print(f'error: {params}')
                    continue

    async def _get_house_detail(self):
        while True:
            try:
                region_id, house_id = self.HOUSE_ID_QUEUE.get(block=True, timeout=10)

                async with aiohttp.ClientSession() as session:
                    async with session.get(f'{self.target_endpoint}rent-detail-{house_id}.html') as response:
                        self.HOUSE_DETAIL_QUEUE.put((region_id, await response.text()))
            except Empty:
                break
            except Exception:
                print(f'error: {house_id}')
                continue

    @classmethod
    def save_house_data(cls, region_id, house_detail_info):
        data = HouseParser(house_detail_info, region_id).get_house_info()

        cls.MONGO_DB.save(data)
        cls.ES.save(data)

    def _get_pre_request_data(self, region_id):
        cookies = {'urlJumpIp': region_id}
        params = {'kind': 0, 'region': region_id}

        resp = requests.get(self.target_endpoint, params=params)
        soup = BeautifulSoup(resp.text, 'html.parser')

        csrf_token = soup.find("meta", attrs={"name": "csrf-token"}).get('content')
        cookies.update(resp.cookies)

        return {'csrf_token': csrf_token, 'cookies': cookies}

    def _get_house_id(self, data):
        return data.get('houseid')
Example #13
0
 def __init__(self, transactions_filepath):
     self.schema = 'Acme-Supermarket-Recommendations'
     self.transactions_filepath = transactions_filepath
     self.database = Mongo(self.schema)
Example #14
0
"""Web application."""

import json
import os

from flask import abort, Flask, jsonify, request
import pika

from db import Mongo

app = Flask(__name__)

mongo = Mongo(app)


@app.errorhandler(400)
def bad_request(e):
    """Jsonify 400 response."""
    resp = jsonify({'error': str(e)})
    resp.status_code = 400

    return resp


@app.route('/api/sms', methods=['GET', 'POST'])
def get_sms():
    """SMS endpoint for retrieving and sending messages."""
    if request.method == 'POST':
        data = request.get_json()
        if 'message' not in data or 'phone' not in data:
            abort(400)
Example #15
0
import os
from gensim.models import Word2Vec, KeyedVectors, FastText
from db import Mongo

DB = Mongo()

# Data preprocessing #################################################
db_result = list(DB.cursor()['gallery'].find({"pass": 1}))
result = [post['join'] for post in db_result]

#### Hyperparameters
vec_size = 6
windows = 6
min_count = 10
iteration = 100
workers = 4

model = FastText(sentences=result,
                 size=vec_size,
                 window=windows,
                 min_count=min_count,
                 iter=iteration,
                 workers=workers)
model_result = model.wv.most_similar("AMD 5800X")
print(model_result)
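The FastText call in Example #15 uses the gensim 3.x parameter names (`size`, `iter`); under gensim 4.x the same training step, with everything else unchanged, would look roughly like:

# gensim >= 4.0 renamed a couple of FastText/Word2Vec parameters
model = FastText(sentences=result,
                 vector_size=vec_size,   # was `size`
                 window=windows,
                 min_count=min_count,
                 epochs=iteration,       # was `iter`
                 workers=workers)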
Example #16
0
 def __init__(self, encrypted_account_id):
     self.logger = Logger.Logger()
     self.persistency = Mongo.Mongo(self.logger)
     self.api = RiotClient.RiotClient(self.logger)
     self.encrypted_account_id = encrypted_account_id
Example #17
0
authlist = CONFIG["twitter"]["auth"]

twiAuthList = list(
    map(
        lambda a: twitter.oauth.OAuth(a["oauth_token"], a[
            "oauth_token_secret"], a["consumer_key"], a["consumer_secret"]),
        authlist))

special = [
    '914724274274832384', '803480007775383552', '1019885045933211648',
    '984028782175629314'
]
black = [746964642660966403]

twiList = list(map(lambda a: twitter.Twitter(auth=a), twiAuthList))
db = Mongo(mongoHost, "twitter")


class UserNotFoundException(Exception):
    def __init__(self, value):
        self.value = value

    def __str__(self):
        return (repr(self.value) + ' is not found.')


@UseCache(db=db, keyword='user_id')
def getUserProfile(apilist, user_id=''):
    '''
    get the profile of a user with the given id
    '''
Example #18
0
from monitoring import Logger
from riot_api import PlayerCrawler
from db import Mongo

# ARAM
queue_id = 450
logger = Logger.Logger()
persistency = Mongo.Mongo(logger)

# seed for crawling
seed_encrypted_account_id = '-w9INIopYVNjHShnEGGgdYhREGEW407RXfAG6ltIjfEi_g'
PlayerCrawler.PlayerCrawler(seed_encrypted_account_id).run()

while True:
    match = persistency.get_uncrawled_game(queue_id)
    for participant_identity in match['participantIdentities']:
        encrypted_account_id = participant_identity['player'][
            'currentAccountId']
        if encrypted_account_id == seed_encrypted_account_id:
            pass
        else:
            PlayerCrawler.PlayerCrawler(encrypted_account_id).run()
    persistency.mark_match_as_crawled(match['gameId'])