Example 1
    def __init__(self):

        self.file_path = Path(__file__).parent

        self.model = Model('/Users/shihangyu/Scripts/python/stt_server/model/deepspeech-0.6.1-models/output_graph.pbmm',
                           aBeamWidth=500)

        self.desired_sample_rate = self.model.sampleRate()

        self.logger = getLogger(self.__module__)

        self.tmp_path = self.file_path / 'tmp.wav'
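
    # A hypothetical usage sketch, not part of the original class: transcribe a
    # 16 kHz mono WAV with the model configured above. This assumes the
    # deepspeech 0.6.x Python API, where Model.stt() takes an int16 PCM buffer.
    def transcribe(self, wav_path):
        # local imports keep the sketch self-contained; the snippet's real
        # import block is not shown
        import wave

        import numpy as np

        with wave.open(str(wav_path), 'rb') as wav:
            assert wav.getframerate() == self.desired_sample_rate, \
                'resample the audio to the model sample rate first'
            audio = np.frombuffer(wav.readframes(wav.getnframes()), np.int16)

        return self.model.stt(audio)
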
Example 2
    if m is None:
        print(url)

        return None

    path_parts = [p for p in m.group(2).split("/") if p != '']

    if len(path_parts) <= 1:
        print(url)

        return None
    label = "_".join(path_parts[:-1])
    cli.set(f"{label}|{url}", 1)

if __name__ == '__main__':
    logger = getLogger("cnn")

    fw = open("/hdd/crawl_result/english_classification/crawl_edition.cnn.com_01.json_extract.json.json", "w")

    with open("/hdd/crawl_result/english_classification/crawl_edition.cnn.com_01.json_extract.json", "rb") as fr:
        for lineno, line in enumerate(fr):
            line = line.decode("utf-8").strip()
            jobj = json.loads(line)
            jobj['source'] = 'cnn'
            t = jobj['content'][0]
            # e.g. "(CNN)Officials said ..." -> "(CNN) Officials said ..."
            jobj['content'][0] = re.sub(r"\(CNN ?\w*\)", lambda m: m.group(0) + ' ', t)

            fw.write(json.dumps(jobj) + '\n')


Example 3

import argparse
from pathlib import Path

from tqdm import tqdm

from processor.asr.yitu_asr_processor import yitu_asr_wrapper
from util.log_util import getLogger
from util.redis_util import getRedisClient
from util.util import mapLineCount

logger = getLogger('read_selenium_output')

from postprocess.line_processors import *

if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('--input', required=True)
    parser.add_argument('--pattern')
    parser.add_argument('--postfix')

    args = parser.parse_args()

    input_path = Path(args.input)

    assert input_path.exists()

    if input_path.is_dir():
        assert args.pattern is not None
        logger.info(f'finding files matching {args.pattern} under {input_path}')
        files = list(input_path.rglob(args.pattern))

Example 4

import json
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from typing import Optional
from util.log_util import getLogger

logger = getLogger("guardian")

culture_sencond_level_labels = [
    "books", "music", "tv-and-radio", "artanddesign", "film", "games", "stage"
]
food_sencond_level_labels = ["food"]
travel_sencond_level_labels = ["travel"]


def extract(line: str, line_key: str, *args) -> Optional[str]:
    jobj = json.loads(line)

    url = jobj['url']
    url_parsed = urlparse(url)

    if url_parsed.netloc != "www.theguardian.com":
        print(f"{line_key} {jobj['url']} error: invalid netloc")
        return None

    # sencond_level_label = [p for p in url_parsed.path.split('/') if p != ''][0]
    #
    # if sencond_level_label not in food_sencond_level_labels:
    #     print(f"{line_key} {jobj['url']} error: unknown label")
    #     return None
Example 5
import json
import re
import time

import requests

from util.log_util import getLogger
from util.redis_util import getRedisClient
from typing import Optional

logger = getLogger("huffpost_apis")

headers = {
    "User-Agent":
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363'
}

sections = [
    'sports', 'entertainment', 'business', 'science', 'technology',
    'relationships', 'women', 'religion', 'travel', 'green', 'taste'
]

topics = [
    'the-worldpost', 'politics', 'college', 'education', 'divorce', 'weddings',
    'arts-and-culture', 'art', 'environment', 'health-living',
    'health-and-wellness', 'worklife'
]

category_labels = {
    'the-worldpost': (["world"], 0),
    'sports': (["sports"], 1),

Example 6

import argparse
from pathlib import Path

from tqdm import tqdm
from util.util import mapLineCount

from postprocess.pool_wrapper import PoolWrapper
from classification_data_eng.guardian.guardian_extractor import extract
from classification_data_eng.check_label_count import redis_deduplicate
from util.log_util import getLogger
from util.redis_util import getRedisClient

if __name__ == '__main__':
    pw = PoolWrapper(redis_deduplicate)

    logger = getLogger('process_line_by_line_multiprocess')

    parser = argparse.ArgumentParser()

    parser.add_argument('--input', required=True)
    parser.add_argument('--pattern')
    parser.add_argument('--postfix', required=True)
    parser.add_argument('--save_result', action='store_true')
    parser.add_argument('--save_redis', action='store_true')
    parser.add_argument('--redis_db', type=int, default=0)

    args = parser.parse_args()

    save_result = args.save_result

    save_redis = args.save_redis
Example 7
import json
import re
from urllib.parse import unquote
import requests

from util.log_util import getLogger
from util.redis_util import getRedisClient

from bs4 import BeautifulSoup

logger = getLogger("daypop_apis")
categorys = [
    "UAE", "Arab", "World", "Entertainment", "Sport", "ScienceTechnology",
    "Business", "Health"
]


def get_list_by_category(category: str, page: int, lang: str):
    url = f"https://api.daypop.ai/daypop/v1/channel/{category}?lang={lang}&page={page}"

    response = requests.get(url)

    assert response.status_code == 200, f"Request {get_list_by_category.__name__}:{category}:{page} failed with code {response.status_code}"

    res = json.loads(response.text)["data"][category]

    logger.debug(
        f"{get_list_by_category.__name__}:{category}:{page} returned a list of [{len(res)}] articles"
    )

    return res
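

# A small usage sketch (assumptions: 'en' is a valid lang value and the real
# crawler may paginate differently): fetch the first few pages per category.
if __name__ == '__main__':
    for category in categorys:
        for page in range(1, 4):  # hypothetical page range
            articles = get_list_by_category(category, page, lang='en')
            logger.info(f'{category} page {page}: {len(articles)} articles')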
Example 8
import json
import os
import time
from pathlib import Path

import requests

from util.log_util import getLogger

logger = getLogger('AcademiaApis')


def login():
    headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Origin': 'https://academia-arabia.com',
        'Upgrade-Insecure-Requests': '1',
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        'Sec-Fetch-Dest': 'document',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '******',
        'Referer': 'https://academia-arabia.com/Account/Login',
        'Accept-Language': 'en-US,en;q=0.9,mt;q=0.8',
    }
Example 9
import json
import math
import random

from util.log_util import getLogger

logger = getLogger('line_processors')


def random_pick(line: str):
    '''
    Take one input line and return the processed line, or None on failure.
    :param line: one JSON-encoded line
    :return: the line if it is kept, otherwise None
    '''
    try:
        json_obj = json.loads(line)
    except json.JSONDecodeError:
        logger.error('JSONDecodeError')
        return None
    ar_sen = json_obj.get('ar_sen')
    if random.random() > 1 / 5:
        return None
    if ar_sen is None:
        logger.warning(line)
        return None

    return line
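

# A small usage sketch (hypothetical paths, not in the original module): stream
# a JSON-lines file through random_pick, keeping roughly one line in five.
def sample_file(in_path: str, out_path: str):
    with open(in_path, encoding='utf-8') as fr, open(out_path, 'w', encoding='utf-8') as fw:
        for line in fr:
            kept = random_pick(line)
            if kept is not None:
                fw.write(kept)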


def print_len(line: str):
Example 10
filename = args.filename
'''
curl "https://academia-arabia.com/Pages/72131/76/${page}/false/1/2" 
-H 'Sec-Fetch-Mode: cors' 
-H 'Sec-Fetch-Site: same-origin' 
-H 'Accept-Language: zh-CN,zh;q=0.9' 
-H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36' 
-H 'Accept: */*' 
-H 'Referer: https://academia-arabia.com/Assets/Viewer/oldVersion/build/pdf.worker.js' 
-H 'Sec-Fetch-Dest: empty' 
-H 'Cookie: _ga=GA1.2.355087959.1583647731; _gid=GA1.2.1177952183.1583647731; _culture=en-US; __gads=ID=cdb3a5f28402c03d:T=1583666253:S=ALNI_MaIYp3HDwAT_skRJlxFdrHF5Dfjvw; cb-enabled=accepted; ASP.NET_SessionId=ozkti0ya22cnt5umbxgwj55z; APP-SRV=1; _gat_gtag_UA_23555050_5=1' 
-H 'Connection: keep-alive'
'''
redis_cli = getRedisClient(db=0)

logger = getLogger('Download')

books = readBooks(filename)
books = sorted(books, key=lambda d: d['ORG'])

cookie = login()
if cookie is None:
    exit('log in failed')

for iter_id, book in enumerate(books):
    book_id = book.get('ORG')
    total_pages_num = book.get('NumOfPages')

    logger.info(f'Downloading book {book_id}')

    # page_id from 1 to total_pages_num
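    # A hypothetical sketch of that per-page fetch, modelled on the curl
    # reference above; the real id layout, headers and cookie handling may
    # differ, and `requests` is assumed to be imported earlier in the file.
    for page_id in range(1, total_pages_num + 1):
        page_url = f'https://academia-arabia.com/Pages/{book_id}/76/{page_id}/false/1/2'
        response = requests.get(page_url, headers={'Cookie': cookie})
        if response.status_code != 200:
            logger.warning(f'book {book_id} page {page_id} failed with {response.status_code}')
            continue
        redis_cli.set(f'{book_id}|{page_id}', 1)  # mark the page as fetched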
Example 11
import json
from html import unescape
from typing import Optional, List

from lxml import html

from util.log_util import getLogger

logger = getLogger(__file__)


def htmlResponseToLines(response: str) -> Optional[List[str]]:
    '''
    1. split into sentences with nltk and on \n
    2. remove consecutive blank lines
    3. deduplicate lines within one page

    :param response: JSON string holding the raw HTML under the 'response' key
    :return: list of lines on success, None on failure
    '''

    json_obj = json.loads(response)

    response = json_obj.get('response')

    try:
        # soup = BeautifulSoup(response, 'html.parser')

        # lines2d = [splitLine(para.get_text()) for para in soup.body.find_all(re.compile(r'^p$|^h[1-6]$|^span$|^a$|^li$'))]

        # lines = list(itertools.chain(*lines2d))

Example 12

import hashlib
import hmac
import os
import time
import uuid
from pathlib import Path

import librosa
import numpy as np
import requests
import scipy.io.wavfile as wavfile
import soundfile as sf

from util.log_util import getLogger
from util.redis_util import getRedisClient
from util.time_util import get_utc_timestamp

logger = getLogger("YiTUASR")


def get_HmacSha256(message, secret_key):
    return hmac.new(bytes(secret_key, 'latin-1'),
                    msg=bytes(message, 'latin-1'),
                    digestmod=hashlib.sha256).hexdigest()
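

# A minimal usage sketch (hypothetical dev id and key; the real YiTu signing
# flow is not shown in this snippet): sign "dev_id + timestamp" with the helper.
def build_signature(dev_id: str, secret_key: str) -> str:
    timestamp = str(get_utc_timestamp())
    return get_HmacSha256(dev_id + timestamp, secret_key)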


def check_amend_wav(filename: str, amend_after_check):
    assert Path(filename).is_file(), "input file does not exist"

    tmpfiles = []

    audio_file = sf.SoundFile(filename)
    audio_seconds = len(audio_file) / audio_file.samplerate
Example 13
import json
import bs4
import re
from bs4 import BeautifulSoup
from typing import Optional
from urllib.parse import urlparse

from util.log_util import getLogger
logger = getLogger("bbc_extractor")


def extract_food(line: str, line_key: str, *args) -> Optional[str]:
    jobj = json.loads(line)
    url = jobj['url']
    url_parsed = urlparse(url)

    try:
        label_id = 0

        response = jobj['response']
        soup = BeautifulSoup(response, 'html.parser')

        title_node = soup.select("h1.blocks-article__headline")

        if len(title_node) != 1:
            logger.error(f"no title {line_key} {url}")
            return None
        title = title_node[0].get_text()

        # **************** content ***************
Example 14
import json
import mmap
import time
from datetime import datetime

from util.log_util import getLogger
from util.redis_util import getRedisClient
from util.regex_util import REGPATTERNS

logger = getLogger('Util')

from bisect import bisect_left


def mapLineCount(filename):
    '''Count the lines in a file via mmap; return 0 if the file cannot be read.'''
    try:
        with open(filename, "r+") as f, mmap.mmap(f.fileno(), 0) as buf:
            lines = 0
            readline = buf.readline
            while readline():
                lines += 1
            return lines
    except (OSError, ValueError):
        return 0
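

# A small usage sketch (hypothetical helper, not in the original module):
# pre-count lines so a progress bar can show a total while streaming a file.
def iter_with_total(filename):
    from tqdm import tqdm  # assumed available, as in the other scripts here

    total = mapLineCount(filename)
    with open(filename, encoding='utf-8') as f:
        for line in tqdm(f, total=total):
            yield line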


def mapLineCharCount(filename):
    try:
        f = open(filename, "r+")
        buf = mmap.mmap(f.fileno(), 0)
Example 15
import json
import pickle
from pathlib import Path
from typing import List

from util.log_util import getLogger

logger = getLogger('read_books_json')


def readBooks(filename) -> List[dict]:
    '''
    Read a list of unique book dicts from a local JSON-lines file next to this script.
    :param filename: name of the JSON-lines file
    :return: list of book dicts
    '''

    book_id_set = set()

    books = []

    with open(Path(__file__).with_name(filename), 'r', encoding='utf-8') as f:
        for line in f:
            try:
                jobj = json.loads(line.strip())
            except json.JSONDecodeError:
                logger.error('JSONDecodeError')
                continue

            if jobj['ORG'] in book_id_set:
                logger.warning(f"duplicate book {jobj['ORG']}")
            else:
                book_id_set.add(jobj['ORG'])
                books.append(jobj)

    return books
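

if __name__ == '__main__':
    # A minimal usage sketch; 'books.json' is a hypothetical filename expected
    # next to this script, since readBooks() resolves it with Path(__file__).
    books = readBooks('books.json')
    logger.info(f'loaded {len(books)} unique books')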