def export_serialized(df, column='text', path=None):
    '''
    Serialize a dataframe column to a list of dicts,
    where each dict holds the row's ID and the column's value.

    Parameters
    ----------
    df : pd.DataFrame
        dataframe to unpack

    column : str (default: 'text')
        name of df's column to export

    path : str, optional
        where to save the resulting .ndjson object
    '''

    # get ID column
    df_id = (df.reset_index().rename(columns={'index': 'ID'}))

    # convert data to list of dicts
    serial_output = []
    for i, row in df_id.iterrows():
        doc = {'ID': row['ID'], column: row[column]}
        serial_output.append(doc)

    # if path is specified, save & be silent
    if path:
        with open(path, 'w') as f:
            ndjson.dump(serial_output, f)
        return None

    # if no path, return list of dicts
    else:
        return serial_output
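
A quick usage sketch for the helper above (the toy dataframe and output path are made up for illustration; it assumes pandas and ndjson are imported as in the original module):

import ndjson
import pandas as pd

df = pd.DataFrame({'text': ['first doc', 'second doc']})

# in memory: [{'ID': 0, 'text': 'first doc'}, {'ID': 1, 'text': 'second doc'}]
docs = export_serialized(df, column='text')

# on disk: writes one JSON object per line and returns None
export_serialized(df, column='text', path='docs.ndjson')
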
Example #2
def make_dataset(
    texts_path: Path,
    num_chars: int,
    eliminate_words: List[str],
    out_text: Path,
    out_char: Path,
):
    texts: List[str] = texts_path.read_text(encoding='UTF8').split()
    texts = list(
        filter(lambda s: all(w not in s for w in eliminate_words), texts))

    counter = Counter("".join(texts))
    chars = "".join(c[0] for c in counter.most_common(num_chars))

    texts = list(
        filter(lambda s: not contain_unknown_chars(s, chars=chars), texts))

    with out_text.open('w', encoding='UTF8') as f:
        ndjson.dump([{"str": s} for s in texts], f, ensure_ascii=False)
    with out_char.open('w', encoding='UTF8') as f:
        json.dump(list(chars), f, ensure_ascii=False)

    # show alphabet
    for c in "abcdefghijklmnopqrstuvwxyz":
        if c in chars:
            print(c, chars.index(c))
        else:
            print(c, "not present")
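
A hedged usage sketch for make_dataset; the file names are hypothetical, and it assumes the original module's imports (Path, Counter, List, json, ndjson) plus its contain_unknown_chars helper are available:

from pathlib import Path

make_dataset(
    texts_path=Path('corpus.txt'),   # whitespace-separated texts
    num_chars=100,                   # keep the 100 most frequent characters
    eliminate_words=['<unk>'],
    out_text=Path('texts.ndjson'),
    out_char=Path('chars.json'),
)
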
Example #3
def mk_traindata(filename):

    output_list = []
    print("Enter the label and press Enter.")
    kana = sys.stdin.read(1)
    print("Start")
    c = 0
    try:
        while True:
            data, command = record()
            if len(data) == 1:
                continue
            else:
                c += 1
                data = list(map(lambda x: [x[0], x[1]], data))
                print(data)
                sys.stdout.write(
                    "Recorded input #{}. Press Ctrl+C to stop.\n".format(c))
                list(map(lambda x: x.insert(1, 'down'), data))
                dic = {"kana": kana, "events": data}
                output_list.append(dic)
            if c % 10 == 0:
                with open(os.path.join('../../data/', filename), 'a') as f:
                    ndjson.dump(output_list, f)
                    f.write("\n")
                print("Saved the inputs so far.")
                output_list = []
    except KeyboardInterrupt:
        with open(os.path.join('../../data/', filename), 'a') as f:
            ndjson.dump(output_list, f)
Example #4
def _url_to_ndjson(target_url: str, ndjson_folder: Union[Path, str]):
    """Fetch json formatted data from a specific CBS table url and write each page as an ndjson file.

    Parameters
    ----------
    target_url : str
        The url to fetch from
    ndjson_folder : str or Path
        The folder to store all output files

    Returns
    -------
    Path or None
        Path to the written ndjson file, or None if the url returned no values
    """

    logger.debug(f"load_from_url: url = {target_url}")
    r = requests.get(target_url).json()
    if r["value"]:
        # Write as ndjson
        filename = (
            f"page_{int(target_url.split('skip=')[-1])//10000}.ndjson"  # TODO: this is built for v3 - v4 datasets inappropriately names the files page_10, page_20, page_30, etc.
            if "skip" in target_url else "page_0.ndjson")
        path = Path(ndjson_folder) / Path(filename)
        with open(path, "w+") as f:
            ndjson.dump(r["value"], f)
        return path
    else:
        return None
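
A sketch of how the helper above could be driven page by page; the base URL is hypothetical, the 10000-row step mirrors the page size assumed in the filename logic, and the surrounding module's imports (requests, ndjson, Path, logger) are taken as given:

from pathlib import Path

base_url = "https://example.com/odata/MyTable/TypedDataSet"   # hypothetical v3-style feed
out_dir = Path("ndjson_out")
out_dir.mkdir(exist_ok=True)

skip = 0
while True:
    page_path = _url_to_ndjson(f"{base_url}?$skip={skip}", out_dir)
    if page_path is None:   # empty 'value' array: no more pages
        break
    skip += 10000
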
def get_segmented_reviews(retrievepath, savepath):
    '''Read the source json file at retrievepath, run sentence segmentation
    on each review's text, and dump the result to the file at savepath.'''

    # open the json file and read the reviews in.
    # The file is actually ndjson (records separated by newlines, not commas)
    try:
        with open(retrievepath, encoding='latin-1') as f:
            datastore = ndjson.load(f)
    except IOError:
        print('An error occurred trying to read the file.')
        return

    # using sent_tokenize() to split a review text into a list of sentences.
    for review in datastore:
        review['text'] = sent_tokenize(review['text'])
        # number of sentence in each review text
        review['num_sentence'] = len(review['text'])

    # save the segmented comments to data folder for further analysis
    try:
        with open(savepath, 'w+') as f:
            ndjson.dump(datastore, f)
    except IOError:
        print('An error occurred trying to save the file.')
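
A minimal call sketch; the file names are illustrative, and it assumes ndjson and nltk's sent_tokenize are imported as in the original module, with input records carrying a 'text' field:

# reads reviews.ndjson, splits each 'text' into sentences, writes reviews_segmented.ndjson
get_segmented_reviews('reviews.ndjson', 'reviews_segmented.ndjson')
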
def write_to_ndjson_gz_file(data: List[Dict], output_file: PathLike):
    output_file = Path(output_file)
    if not output_file.name.endswith(".ndjson.gz"):
        raise ValueError("Output file must end with .ndjson.gz")
    ndjson_file = output_file.parent / output_file.stem
    with ndjson_file.open('w') as f:
        ndjson.dump(data, f)
    gzip_file(ndjson_file, output_file, keep=False)
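
A small usage sketch, assuming the gzip_file helper and the imports (Path, ndjson, List, Dict, PathLike) from the original module:

records = [{"id": 1, "label": "a"}, {"id": 2, "label": "b"}]

# writes records.ndjson, then gzips it to records.ndjson.gz and removes the intermediate file
write_to_ndjson_gz_file(records, "records.ndjson.gz")
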
def write_to_file(file_name: int, index: defaultdict):
    """
    Write the merged index to a separate file.
    """
    dict_for_index = sorted(
        index.items())  # sort posting lists
    with open(FINAL_INDEX_PATH + "{}.json".format(file_name), 'w') as file:
        ndjson.dump(dict_for_index, file)
Example #8
def calculate(doc_top_prob,
              ID,
              window: int,
              out_dir=None,
              curb_incomplete=False):
    '''Calculate Novelty, Transience & Resonance on a single window.
    This function is wrapped in process_windows() - see it for details.
    '''

    # make sure there is a folder to save it
    if out_dir:
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

    # signal calculation
    idmdl = InfoDynamics(data=doc_top_prob,
                         time=ID,
                         window=window,
                         weight=0,
                         sort=False)
    idmdl.novelty(meas=jsd)
    idmdl.transience(meas=jsd)
    idmdl.resonance(meas=jsd)

    lignes = list()
    for i, doc_id in enumerate(ID):
        d = dict()
        d["doc_id"] = doc_id
        # HACK because of IndexError
        try:
            d["novelty"] = idmdl.nsignal[i]
            d["transience"] = idmdl.tsignal[i]
            d["resonance"] = idmdl.rsignal[i]
            d["nsigma"] = idmdl.nsigma[i]
            d["tsigma"] = idmdl.tsigma[i]
            d["rsigma"] = idmdl.rsigma[i]
        except IndexError:
            print("[info] there was an Index Error, proceed with caution")
            pass

        lignes.append(d)

    if curb_incomplete:
        # keep only rows with full records
        lignes = lignes[window:-window]

    if out_dir:
        # make a filename
        filename = str(window) + 'W' + '.ndjson'
        outpath = os.path.join(out_dir, filename)

        # export
        with open(outpath, "w") as f:
            ndjson.dump(lignes, f)

    return None
Example #9
def parse_dataset(path=RAW_DIR_NAME, decode=None, early_return=True):
    """
    Restructures dataset from '.ndjson' files into folders. Each folder will be
    of the form 'dataset/{LABEL}' and will contain 1 file per training example.
    Also saves the list of all filenames to 'filenames.txt'.

    @param path - str: path to directory containing dataset
    @param decode - None or "jpg" - how to decode training examples
    @param early_return - bool: indicates whether method should return early
        if 'filenames.txt' already exists

    @returns list containing all the filenames of the training examples 
        (relative to path)
    @returns list containing all the labels of the dataset
    """
    list_ids = []
    labels = set()

    # If the filenames.txt file already exists, parse the file to find
    # list_ids and labels, and return early
    if decode == 'jpg':
        list_ids_filename = os.path.join(path, '../img/' + 'filenames.txt')
    else:
        list_ids_filename = os.path.join(path, 'filenames.txt')
    if early_return and os.path.exists(list_ids_filename):
        with open(list_ids_filename) as f:
            list_ids = ndjson.load(f)
        for list_id in list_ids:
            label = os.path.basename(os.path.dirname(list_id))
            labels.add(label)
        return list_ids, list(labels)

    # Loop through all '.ndjson' files and split into individual files
    pool = mp.Pool(mp.cpu_count())
    files = os.listdir(path)
    files = [f for f in files if os.path.splitext(f)[1] == '.ndjson']
    list_ids_temp = []

    parse = functools.partial(parse_label, path=path, decode=decode)
    pool.map_async(parse, files, callback=list_ids_temp.extend)
    pool.close()
    pool.join()

    # Convert list_ids_temp from list of lists to just a list
    list_ids = []
    for list_id in list_ids_temp:
        list_ids += list_id

    # Write output to 'dataset/filename.txt' and find all labels
    with open(list_ids_filename, 'w') as f:
        ndjson.dump(list_ids, f)
    for list_id in list_ids:
        label = os.path.basename(os.path.dirname(list_id))
        labels.add(label)
    return list_ids, list(labels)
Example #10
def run_eval(opt=None,
             model=None,
             loader=None,
             dataset='test',
             write_result=False):
    if opt is None:
        opt = TestOptions().parse()
    if model is None:
        model = SketchModel(opt)
    if loader is None:
        loader = load_data(opt,
                           datasetType=dataset,
                           permutation=opt.permutation)
    # print(len(loader))
    if opt.eval_way == 'align':
        predictList, lossList = eval_align_batchN(model,
                                                  loader,
                                                  P=opt.points_num)
    elif opt.eval_way == 'unalign':
        predictList, lossList = eval_unalign_batch1(model, loader)
    else:
        raise NotImplementedError('eval_way {} not implemented!'.format(
            opt.eval_way))
    # print(predictList.shape)
    testData = []
    with open(
            os.path.join('data', opt.dataset, 'train',
                         '{}_{}.ndjson'.format(opt.class_name, dataset)),
            'r') as f:
        testData = ndjson.load(f)

    if opt.metric_way == 'wlen':
        p_metric_list, c_metric_list = eval_with_len(testData, predictList)
    elif opt.metric_way == 'wolen':
        p_metric_list, c_metric_list = eval_without_len(testData, predictList)
    else:
        raise NotImplementedError('metric_way {} not implemented!'.format(
            opt.metric_way))

    if write_result:
        testData = get_eval_result(testData, predictList)
        result_path = os.path.join(
            'data', opt.dataset, 'train',
            '{}_{}.ndjson'.format(opt.class_name, 'res'))
        with open(result_path, 'w') as f:
            ndjson.dump(testData, f)

    loss_avg = np.average(lossList)
    P_metric = np.average(p_metric_list)
    C_metric = np.average(c_metric_list)
    # print('P_metric:{:.4}%\tC_metric:{:.4}%'.format(P_metric*100, C_metric*100))

    return loss_avg, P_metric, C_metric
def writer(file_res, fname, outdir):
    # check for outdir
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    # build the output path
    out_path = os.path.join(outdir, fname + '.ndjson')

    # export
    with open(out_path, 'w') as fout:
        ndjson.dump(file_res, fout)

    return None
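
An illustrative call (the document list and paths are made up; os and ndjson are assumed to be imported):

docs = [{'id': 1, 'text': 'hello'}, {'id': 2, 'text': 'world'}]
writer(docs, fname='session_01', outdir='processed')   # -> processed/session_01.ndjson
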
def generate(filename, lang):
    with open(filename) as i:
        items = ndjson.load(i)

    translated_items = generate_synthetic_data(items, lang)

    basename = os.path.basename(filename)
    basename_without_extension = os.path.splitext(basename)[0]
    output_name = "{}_{}.ndjson".format(basename_without_extension, lang)

    with open(output_name, "w") as o:
        ndjson.dump(translated_items, o)
Example #13
def agregando():
    try:
        key, value = word.split('->')

        if key in agregar:
            tempDictionary[key] = value
    except ValueError:
        # the field carries two values: 'key->value1->value2'
        key, value1, value2 = word.split('->')
        # if key in acceptedFields:
        tempDictionary[key] = value1 + value2

#EXTRACT
#with codecs.open(rawPath, 'r', encoding='utf-8', errors='ignore') as rawFile:
#      lines = rawFile.readlines()
#      tempDictionary = {}
#      total = 0
#CLEAN
#      for line in lines:
#          if 'BREAK-REVIEWED' in line:
#             tempDictionary = {}
#          elif re.search('[0-9]+\.[0-9]+', line):
#
#              tempDictionary['price'] = float(re.findall(
#                  '[0-9]+\.[0-9]+', line)[0])
#          else:
#            #Cleaning
#              cleanLine = line.split()
#              cleanLine = " ".join(cleanLine)
#              cleanLine = cleanLine.replace('<->', ' - ')
#              cleanLine = cleanLine.replace('ice>Link Plus -> ', '')
#              cleanLine = cleanLine.replace('\"', '')
#              cleanLine = cleanLine.replace('.', '')
#              cleanLine = cleanLine.replace("'", "")
#              soup = BeautifulSoup(cleanLine, "html.parser")
#              cleanLine = soup.get_text()
#              # if re.search('[0-9]+["] ', cleanLine):
#              # cleanLine = cleanLine.replace('"','')
#              cleanLine = cleanLine.lower()
#            #Find KeyValues
#              words = re.findall('\S+->.*?(?= \S+->|$)', cleanLine)
#
#              #Get Data
#              for word in words:
#
#                  agregando();
#
#              if(set(agregar).issubset(tempDictionary)):
#                  productList.append(tempDictionary.copy())

#SAVE THE CLEANED DATA TO A JSON FILE
    with open('backup.json', 'w') as f:
        ndjson.dump(productList, f, sort_keys=True, cls=DecimalEncoder)
Example #14
def fetch_kibana_object(obj_type, exportpath):
    try:
        print('# Fetching kibana objects: %s' % obj_type)
        response = requests.post(KIBANA_OBJECTS_EXPORT_URL,
                                 json={'type': obj_type},
                                 verify=False,
                                 auth=(KIBANA_USER, KIBANA_PASS),
                                 headers={'kbn-xsrf': 'true'})
        if response.status_code != 200:
            print('!!! Error fetching kibana object %s: HTTP status code %s' %
                  (obj_type, response.status_code))
        else:
            rawData = response.text.encode('utf-8')
            items = ndjson.loads(rawData)
            if obj_type != 'index-pattern':
                toExport = []
                for ip in items:
                    if 'attributes' in ip and 'title' in ip['attributes']:
                        if re.match(REDELK_OBJ_FILTER,
                                    ip['attributes']['title'], re.IGNORECASE):
                            ip.pop('updated_at', None)
                            ip['version'] = '1'
                            toExport.append(ip)
                export_file = os.path.join(
                    exportpath,
                    '%s%s.ndjson' % (EXPORT_FILES_PREFIX_KIBANA, obj_type))
                print('\tExporting %s: %s' % (obj_type, export_file))
                with open(export_file, 'w') as f:
                    ndjson.dump(toExport, f)
            else:
                for ip in items:
                    if 'attributes' in ip and 'title' in ip['attributes']:
                        if re.match(INDEX_PATTERNS_FILTER,
                                    ip['attributes']['title'], re.IGNORECASE):
                            # print('%s: %s' % (obj_type,ip['attributes']['title']))
                            title = ip['attributes']['title']
                            pn = title[:-2] if title.endswith('-*') else title
                            ip.pop('updated_at', None)
                            ip['version'] = '1'
                            export_file = os.path.join(
                                exportpath, '%s%s_%s.ndjson' %
                                (EXPORT_FILES_PREFIX_KIBANA, obj_type, pn))
                            print('\tExporting %s: %s' %
                                  (obj_type, export_file))
                            with open(export_file, 'w') as f:
                                ndjson.dump([ip], f)
    except Exception as e:
        print('!!! Error fetching kibana object %s: %s' % (obj_type, e))
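
A rough driver sketch, assuming the module-level constants (KIBANA_OBJECTS_EXPORT_URL, KIBANA_USER, KIBANA_PASS, the filters and export prefix) are configured as in the original script:

for obj_type in ('index-pattern', 'search', 'visualization', 'dashboard'):
    fetch_kibana_object(obj_type, '/tmp/kibana_export')
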
def generate_nd_json(*args, **kwargs):
    item_number = 10
    users = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
    events = ['tutorial', 'iap']
    items = [{
        'user': random.choice(users),
        'timestamp': str(random_timestamp.random_timestamp(2020)),
        'evtname': random.choice(events),
        'spend': round(random.random(), 2)
    } for i in range(item_number)]
    # dump to file-like objects
    with open(
            os.path.join('/home/airflow/gcs', 'data', 'test',
                         f"{kwargs['execution_date']}.ndjson"), 'w') as f:
        ndjson.dump(items, f)
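
One way this callable might be wired into a DAG (a sketch assuming Airflow 2.x, where the template context, including execution_date, is passed to the callable as **kwargs; the dag_id and schedule are invented):

from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator

with DAG(dag_id='export_test_events',
         start_date=datetime(2021, 1, 1),
         schedule_interval='@daily') as dag:
    PythonOperator(task_id='generate_nd_json',
                   python_callable=generate_nd_json)
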
def test_validate_ndjson_uuid(tmp_path, project):
    file_name = "repeat_uuid.ndjson"
    file_path = tmp_path / file_name
    repeat_uuid = PREDICTIONS.copy()
    repeat_uuid[0]['uuid'] = 'test_uuid'
    repeat_uuid[1]['uuid'] = 'test_uuid'

    with file_path.open("w") as f:
        ndjson.dump(repeat_uuid, f)

    with pytest.raises(UuidError):
        project.upload_annotations(name="name", annotations=str(file_path))

    with pytest.raises(UuidError):
        project.upload_annotations(name="name", annotations=repeat_uuid)
def get_ab_test_information(deviceType):
    # Select authentication keys for App
    if deviceType.upper() == 'IOS':
        APP_ID = IOS_APP_ID_KEY
        EXPORT_KEY = IOS_EXPORT_KEY
        CONTENT_KEY = IOS_CONTENT_READ_ONLY_KEY
    elif deviceType.upper() == 'ANDROID':
        APP_ID = ANDROID_APP_ID_KEY
        EXPORT_KEY = ANDROID_EXPORT_KEY
        CONTENT_KEY = ANDROID_CONTENT_READ_ONLY_KEY
    else:
        #TODO: How do you make sure that if this happens, airflow knows it was a failed job?
        logging.error(f'{job_name} - get_ab_test_information: Device not one of IOS or Android. Stopping Application')
        return 'Device not one of IOS or Android'

    # Construct abTests URL
    getABTests_url = getABTests_api+f'''&appId={APP_ID}&clientKey={CONTENT_KEY}&apiVersion=1.0.6'''

    # Get AB Tests Information from API
    logging.info(f'''{job_name} - get_ab_test_information: Requesting list of A/B Tests from API for {deviceType}''')
    http = urllib3.PoolManager(
        cert_reqs='CERT_REQUIRED',
        ca_certs=certifi.where()
    )
    response = http.request('GET', getABTests_url)
    response_return = response.data.decode('utf-8')
    response_json = json.loads(response_return)

    # Check response for successful data pull
    logging.info(f'''{job_name} - get_ab_test_information: Checking response for successful data retrieval''')

    if response_json['response'][0]['success'] and response.status == 200:
        logging.info(f'''{job_name} - get_ab_test_information: Data successfully retrieved''')
        data = response_json['response'][0]

        json_file = '/Users/gkaberere/spark-warehouse/leanPlum/saved_json_ndjson.json'

        with open(json_file, 'w') as file:
            ndjson.dump(data, file)


    else:
        #TODO: How do you make sure that if this happens airflow knows it's a failed job
        logging.error(f'''{job_name} - get_ab_test_information: Response did not meet success = True or have a response status = 200''')
        return

    return
def test_create_from_local_file(tmp_path, project):
    name = str(uuid.uuid4())
    file_name = f"{name}.ndjson"
    file_path = tmp_path / file_name
    with file_path.open("w") as f:
        ndjson.dump(PREDICTIONS, f)

    bulk_import_request = project.upload_annotations(
        name=name, annotations=str(file_path))

    assert bulk_import_request.project() == project
    assert bulk_import_request.name == name
    assert bulk_import_request.error_file_url is None
    assert bulk_import_request.status_file_url is None
    assert bulk_import_request.state == BulkImportRequestState.RUNNING
    __assert_file_content(bulk_import_request.input_file_url)
def test_create_from_local_file(tmp_path, predictions, configured_project):
    name = str(uuid.uuid4())
    file_name = f"{name}.ndjson"
    file_path = tmp_path / file_name
    with file_path.open("w") as f:
        ndjson.dump(predictions, f)

    bulk_import_request = configured_project.upload_annotations(
        name=name, annotations=str(file_path), validate=False)

    assert bulk_import_request.project() == configured_project
    assert bulk_import_request.name == name
    assert bulk_import_request.error_file_url is None
    assert bulk_import_request.status_file_url is None
    assert bulk_import_request.state == BulkImportRequestState.RUNNING
    assert_file_content(bulk_import_request.input_file_url, predictions)
def write_dict_to_file(
    file_number: int, dict_for_index: defaultdict) -> int:
    """
    Write one block of the index (built from a batch of texts) to a temporary
    index file. First stage of creating the index with SPIMI.
    """
    for key in dict_for_index.keys():
        term_freqs = Counter(dict_for_index[key])  # count terms freqs in docs
        dict_for_index[key] = term_freqs
    dict_for_index = sorted(dict_for_index.items())  # sort posting lists
    with open("data/index_blocks/index_file{}.txt".format(str(file_number)),
              "w") as f:
        ndjson.dump(dict_for_index, f)
        # write as json for comfortable reading lines
        file_number += 1  # this var is for naming tmp index files
    return file_number
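
A toy invocation (it assumes the data/index_blocks/ directory exists and that Counter and ndjson are imported as in the original module):

from collections import defaultdict

index = defaultdict(list)
index["apple"].extend([1, 1, 2])   # term -> doc ids, repeats counted later
index["banana"].append(2)

# writes data/index_blocks/index_file0.txt and returns 1 for the next block
next_number = write_dict_to_file(0, index)
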
Example #21
def save_training_example(drawing, path, decode=None):
    """
    Saves a single training example to the directory of specified path. The
    filename will be set to the key_id.

    @param drawing - dict: raw data from the Quick, Draw! dataset with keys
                           'word', 'key_id', and 'drawing'
    @param path - str: folder where training examples will be stored.
    @param decode (None or "jpg"): whether to decode sketches as images. By
                            default, sketches are saved as ndjson files.

    @returns str - the filename where the training example is saved.
    """
    ext = '.jpg' if decode == 'jpg' else '.ndjson'
    filename = os.path.join(path, drawing['key_id'] + ext)
    if not os.path.exists(filename):
        drawing_simplified = [{
            'word': drawing['word'],
            'key_id': drawing['key_id'],
            'drawing': drawing['drawing']
        }]
        if decode == 'jpg':
            drawing_decoded = decode_drawing(drawing['drawing'])
            cv2.imwrite(filename, drawing_decoded)
        else:
            with open(filename, mode='w') as f:
                ndjson.dump(drawing_simplified, f)
    result = os.path.join(drawing['word'], drawing['key_id'] + ext)

    # Return only the label with the key_id for sake of space.
    return result
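
A hedged driver sketch: the input file name is hypothetical (any raw Quick, Draw! ndjson file would do), the target folder must already exist, and ndjson is assumed to be imported:

import ndjson

with open('full_raw_cat.ndjson') as f:
    drawings = ndjson.load(f)

for drawing in drawings:
    save_training_example(drawing, path='dataset/cat')   # one .ndjson file per key_id
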
def concat_texts_timebins(lemma_path, metadata_path, outdir, timebin='10Min'):
    '''
    Concatenate per-document features into fixed time bins and export the
    result as ndjson, one record per time bin.
    '''
    # load files
    with open(lemma_path) as fin:
        file_lemma = ndjson.load(fin)

    with open(metadata_path) as fin:
        file_meta = ndjson.load(fin)
        timestamps = [doc['start'] for doc in file_meta]
        del file_meta

    # resample
    df_resampled = (pd.DataFrame(
        file_lemma, index=pd.to_timedelta(timestamps)).resample(timebin).sum())

    # get rid of 0 (no document in time bin)
    df_resampled = (df_resampled.replace(0, np.nan).dropna())

    # get rid of [] (there is a document but no features in time bin)
    df_resampled['text'] = df_resampled.text[df_resampled.text.apply(len) > 0]
    df_resampled = (df_resampled.dropna().reset_index())

    # get timestamp as str
    df_resampled['time'] = df_resampled['index'].astype(str).str.extract(
        r'days (.*?)\.')

    # serialize
    file_res = []
    for i, row in df_resampled.iterrows():
        res = dict()
        res.update({
            'time': row.time,
            'text': row.text,
            'lemma': row.lemma,
            'pos': row.pos,
            'dep': row.dep,
            'ner': row.ner
        })
        file_res.append(res)

    outfname = os.path.basename(lemma_path)
    with open(os.path.join(outdir, outfname), 'w') as fout:
        ndjson.dump(file_res, fout)

    return None
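
An illustrative call; the paths are made up, and the lemma/metadata files are assumed to follow the format the function expects (per-document feature dicts plus 'start' timestamps):

concat_texts_timebins(
    lemma_path='features/session_01.ndjson',
    metadata_path='metadata/session_01.ndjson',
    outdir='binned/',
    timebin='30Min',
)
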
Example #23
def test_validate_ndjson_uuid(tmp_path, configured_project, predictions):
    file_name = "repeat_uuid.ndjson"
    file_path = tmp_path / file_name
    repeat_uuid = predictions.copy()
    repeat_uuid[0]['uuid'] = 'test_uuid'
    repeat_uuid[1]['uuid'] = 'test_uuid'

    with file_path.open("w") as f:
        ndjson.dump(repeat_uuid, f)

    with pytest.raises(MALValidationError):
        configured_project.upload_annotations(name="name",
                                              annotations=str(file_path))

    with pytest.raises(MALValidationError):
        configured_project.upload_annotations(name="name",
                                              annotations=repeat_uuid)
Example #24
def run_standard(paths, outdir):
    '''
    Alternative:
    forbidden_pos=['PUNCT', 'CONJ', 'CCONJ', 'DET', 'SYM']
    '''
    # iterate through files
    for path in tqdm(paths):
        # get file name
        fname = os.path.basename(path)
        outpath = os.path.join(outdir, fname)

        # open file
        with open(path) as f:
            session = ndjson.load(f)

        # extend stopwords
        # stopwords = get_first_name_set()
        stopwords = set()
        stopwords.update([
            # 'marisha',
            # 'liam',
            # 'travis',
            # 'keyleth',
            # 'ane',
            # 'taliesin',
            # 'hav',
            # 'orion',
            '-pron-',
            "'s",
            'yeah',
            'guy'
        ])

        # process all docs in that file
        features = extract_features(
            session,
            tokentype='lemma',
            forbidden_pos=['PUNCT', 'CONJ', 'CCONJ', 'DET', 'SYM'],
            langs='en',
            # filter out names
            extended_stopwords=stopwords)

        # export
        with open(outpath, 'w') as f:
            ndjson.dump(features, f)
Example #25
def fetch_data_json(fn, query, fields, format_fn, upload):
    try:
        conn = mysql.connector.connect(
            host=os.environ.get("SUMO_MYSQL_HOST", "localhost"),
            port=os.environ.get("SUMO_MYSQL_PORT", 3306),
            database=os.environ.get("SUMO_MYSQL_DB_NAME", "kitsune"),
            user=os.environ.get("SUMO_MYSQL_USERNAME", "root"),
            password=os.environ.get("SUMO_MYSQL_PASSWORD", ""))
        #        conn = mysql.connector.connect(host='127.0.0.1', port=3306,
        #                                       database='kitsune',
        #                                       user='******')
        #                                       #password='******')
        if conn.is_connected():
            print('Connected to MySQL database')

        cursor = conn.cursor()
        cursor.execute(query)
        row_headers = [x[0] for x in cursor.description]  # extract column headers
        print(row_headers)
        rows = cursor.fetchall()
        json_data = []
        rownum = 0
        for row in rows:
            rownum = rownum + 1
            json_data.append(dict(zip(row_headers, row)))
            if rownum % 100000 == 0:
                print(rownum)

        with open("/tmp/" + fn, 'w') as f:
            ndjson.dump(
                json_data, f, default=convert_pst_to_utc
            )  #, indent=4) # convert datetime to utc and format as str

        if upload:
            CHUNK_SIZE = 128 * 1024 * 1024  # season to taste
            blob = sumo_bucket.blob("kitsune/" + fn, chunk_size=CHUNK_SIZE)
            blob.upload_from_filename("/tmp/" + fn)

        cursor.close()

    except Error as e:
        print(e)
    finally:
        if 'conn' in locals() and conn.is_connected():
            conn.close()
Example #26
def rm_trainingdata(old_fn, new_fn):
    with open(old_fn, "r") as rf, open(new_fn, "w") as wf:
        data = ndjson.load(rf)
        for i, d in enumerate(data):
            if data_dic[d['kana']].m_max_in < len(d['events']):
                print(i, d['kana'], "Greater than maximum")
                continue
            if data_dic[d['kana']].m_min_in > len(d['events']):
                print(i, d['kana'], "Less than minimum")
                continue
            if not d['events'][0][0] in data_dic[d['kana']].m_start:
                print(i, d['kana'], "Starting point is different")
                continue
            if not d['events'][-1][0] in data_dic[d['kana']].m_finish:
                print(i, d['kana'], "End point is different")
                continue
            ndjson.dump([d], wf)
            wf.write("\n")
Example #27
def download_table(
        client: AitoClient,
        table_name: str,
        output_folder: PathLike,
        file_name: str = None,
        batch_size: int = 5000,
        gzip_output: bool = False
):
    """download a table to a NDJSON file or a gzipped NDJSON file

    :param client: the AitoClient instance
    :type client: AitoClient
    :param table_name: the name of the table
    :type table_name: str
    :param output_folder: the folder where the output file is written to
    :type output_folder: PathLike
    :param file_name: the name of the output file, defaults to None in which case the table name is used as the file name
    :type file_name: str
    :param batch_size: the number of entries to be downloaded at once, defaults to 5000
    :type batch_size: int
    :param gzip_output: gzip the output file, defaults to False
    :type gzip_output: bool
    """
    if not file_name:
        file_name = table_name
    out_file_path = Path(output_folder) / f'{file_name}.ndjson'
    if out_file_path.exists():
        LOG.warning(f'output file {out_file_path} already exists')
    LOG.debug(f'downloading table `{table_name}` to {out_file_path}')
    table_size = get_table_size(client, table_name)
    begin_idx = 0
    while begin_idx < table_size:
        last_idx = begin_idx + batch_size if begin_idx + batch_size <= table_size else table_size
        LOG.debug(f'downloading table chunk {begin_idx}:{last_idx}...')
        entries_batch = query_entries(client=client, table_name=table_name, offset=begin_idx, limit=batch_size)
        with out_file_path.open('a+') as f:
            ndjson.dump(entries_batch, f)
            if last_idx != table_size:
                f.write('\n')
        LOG.debug(f'downloaded table chunk {begin_idx}:{last_idx}')
        begin_idx += batch_size
    if gzip_output:
        gzip_file(out_file_path, keep=False)
    LOG.info(f'downloaded table `{table_name}` to {out_file_path}')
def write_block_to_file(
        index: defaultdict,
        number: int,
) -> int:
    """
    Write a defaultdict index block to a json file

    """
    index.pop("", None)  # remove empty key
    for key in index.keys():
        index[key] = Counter(
            index[key]
        )  # reformat data as doc_id: frequency_keyword_in_doc_id_file
    index = sorted(index.items())  # sort posting lists
    with open(BUILDED_INDEX_PATH + "index{}.json".format(str(number)),
              "w") as file:
        ndjson.dump(index, file)
    number += 1  # store file number , used afterwards for merge
    return number
Example #29
def download_user_history(api, output_name, screen_name=None, user_id=None, since_id=None, exclude_replies=False,
                          save_retweeters=False):
    res = []
    for page in tweepy.Cursor(api.user_timeline, screen_name=screen_name, user_id=user_id, tweet_mode="extended",
                              since_id=since_id, count=200, exclude_replies=exclude_replies).pages():
        res.extend(page)
    res = [item._json for item in res]

    with open(output_name, "w") as output:
        ndjson.dump(res, output)

    if save_retweeters:
        print("Extracting retweeters")
        users = {}
        for item in res:
            users[item["id"]] = api.retweeters(item["id"])
        with open(output_name + ".retweets.json", "w") as output:
            json.dump(users, output)
        return users
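
A sketch of calling the downloader with an authenticated tweepy client (credentials and screen name are placeholders; the v1.1 user_timeline endpoint used above must be available to the app):

import tweepy

auth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")
auth.set_access_token("ACCESS_TOKEN", "ACCESS_SECRET")
api = tweepy.API(auth, wait_on_rate_limit=True)

download_user_history(api, "timeline.ndjson", screen_name="jack")
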
Example #30
def add_new_tweets_to_dump(output_file: str) -> None:
    try:
        with open(output_file, 'r') as fp:
            existing_tweets = ndjson.load(fp,
                                          object_hook=tweet_json_decode_hook)
    except FileNotFoundError:
        existing_tweets = []

    existing_tweet_ids = set(tweet.id for tweet in existing_tweets)
    newest_tweet_year = existing_tweets[-1].created_at.year if len(
        existing_tweets) > 0 else 2009

    maybe_new_tweets = _get_all_tweets_after_year(newest_tweet_year)
    new_tweets = (tweet for tweet in maybe_new_tweets
                  if tweet.id not in existing_tweet_ids)

    all_tweets = itertools.chain(existing_tweets, new_tweets)
    with open(output_file, 'w') as fp:
        ndjson.dump((encode_tweet_for_json(tweet) for tweet in all_tweets), fp)
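
Finally, a self-contained round trip showing what every snippet above relies on: ndjson.dump writes one JSON object per line, and ndjson.load reads them back:

import ndjson

items = [{'user': 'A', 'spend': 1.25}, {'user': 'B', 'spend': 0.75}]

with open('items.ndjson', 'w') as f:
    ndjson.dump(items, f)

with open('items.ndjson') as f:
    assert ndjson.load(f) == items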