def export_serialized(df, column='text', path=None):
    '''Serialize a dataframe column to a list of dicts holding an ID and the column value.

    Parameters
    ----------
    df : pd.DataFrame
        dataframe to unpack
    column : str (default: 'text')
        name of df's column to export
    path : str, optional
        where to save the resulting .ndjson object
    '''
    # get ID column
    df_id = (df
             .reset_index()
             .rename(columns={'index': 'ID'}))

    # convert data to list of dicts
    serial_output = []
    for i, row in df_id.iterrows():
        doc = {'ID': row['ID'], column: row[column]}
        serial_output.append(doc)

    # if path is specified, save & be silent
    if path:
        with open(path, 'w') as f:
            ndjson.dump(serial_output, f)
        return None
    # if no path, return list of dicts
    else:
        return serial_output
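# Hedged usage sketch for export_serialized (assumes pandas and ndjson are
# importable alongside the function; 'docs.ndjson' is a placeholder path).
import pandas as pd

df = pd.DataFrame({'text': ['first doc', 'second doc']})

# without a path, the serialized records are returned, e.g.
# [{'ID': 0, 'text': 'first doc'}, {'ID': 1, 'text': 'second doc'}]
docs = export_serialized(df, column='text')

# with a path, the records are written as newline-delimited JSON and None is returned
export_serialized(df, column='text', path='docs.ndjson')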
def make_dataset(
    texts_path: Path,
    num_chars: int,
    eliminate_words: List[str],
    out_text: Path,
    out_char: Path,
):
    texts: List[str] = texts_path.open(encoding='UTF8').read().split()
    texts = list(
        filter(lambda s: all(w not in s for w in eliminate_words), texts))
    counter = Counter("".join(texts))
    chars = "".join(c[0] for c in counter.most_common(num_chars))
    texts = list(
        filter(lambda s: not contain_unknown_chars(s, chars=chars), texts))

    ndjson.dump([{"str": s} for s in texts],
                out_text.open('w', encoding='UTF8'),
                ensure_ascii=False)
    json.dump([c for c in chars],
              out_char.open('w', encoding='UTF8'),
              ensure_ascii=False)

    # show alphabet
    for c in "abcdefghijklmnopqrstuvwxyz":
        if c in chars:
            print(c, chars.index(c))
        else:
            print(c, "not exist")
def mk_traindata(filename):
    output_list = []
    print("Type a label and press Enter.")
    kana = sys.stdin.read(1)
    print("Starting")
    c = 0
    try:
        while True:
            data, command = record()
            if len(data) == 1:
                continue
            else:
                c += 1
                data = list(map(lambda x: [x[0], x[1]], data))
                print(data)
                sys.stdout.write(
                    "Recorded input no. {}. Press Ctrl+C when you want to stop.\n".format(c))
                list(map(lambda x: x.insert(1, 'down'), data))
                dic = {"kana": kana, "events": data}
                output_list.append(dic)
                if c % 10 == 0:
                    with open(os.path.join('../../data/', filename), 'a') as f:
                        ndjson.dump(output_list, f)
                        f.write("\n")
                    print("Saved the input collected so far.")
                    output_list = []
    except KeyboardInterrupt:
        with open(os.path.join('../../data/', filename), 'a') as f:
            ndjson.dump(output_list, f)
def _url_to_ndjson(target_url: str, ndjson_folder: Union[Path, str]):
    """Fetch json formatted data from a specific CBS table url and write
    each page as an ndjson file.

    Parameters
    ----------
    target_url : str
        The url to fetch from
    ndjson_folder : str or Path
        The folder to store all output files

    Returns
    -------
    Path or None
        Path of the written ndjson file, or None if the url holds no values

    Raises
    ------
    FileNotFoundError
        if no values exist in the url
    """
    logger.debug(f"load_from_url: url = {target_url}")
    r = requests.get(target_url).json()
    if r["value"]:
        # Write as ndjson
        # TODO: this is built for v3 - v4 datasets inappropriately names the
        # files page_10, page_20, page_30, etc.
        filename = (f"page_{int(target_url.split('skip=')[-1])//10000}.ndjson"
                    if "skip" in target_url
                    else "page_0.ndjson")
        path = Path(ndjson_folder) / Path(filename)
        with open(path, "w+") as f:
            ndjson.dump(r["value"], f)
        return path
    else:
        return None
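# Hedged usage sketch for _url_to_ndjson; the url below is a made-up paged
# OData url whose trailing skip value drives the page_N filename.
from pathlib import Path

out_dir = Path("cbs_pages")
out_dir.mkdir(exist_ok=True)

written_path = _url_to_ndjson(
    target_url="https://example.invalid/ODataFeed/TypedDataSet?$skip=10000",
    ndjson_folder=out_dir,
)
print(written_path)  # cbs_pages/page_1.ndjson, or None if the page was empty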
def get_segmented_reviews(retrievepath, savepath):
    '''Read the source json file from retrievepath, split each review into
    sentences, and dump the segmented data to the file at savepath.'''
    # open the json file and read the reviews in.
    # The file is actually ndjson (separated by newlines, not commas)
    try:
        with open(retrievepath, encoding='latin-1') as f:
            datastore = ndjson.load(f)
    except IOError:
        print('An error occurred trying to read the file.')

    # use sent_tokenize() to split a review text into a list of sentences.
    for review in datastore:
        review['text'] = sent_tokenize(review['text'])
        # number of sentences in each review text
        review['num_sentence'] = len(review['text'])

    # save the segmented comments to the data folder for further analysis
    try:
        with open(savepath, 'w+') as f:
            ndjson.dump(datastore, f)
    except IOError:
        print('An error occurred trying to save the file.')
def write_to_ndjson_gz_file(data: List[Dict], output_file: PathLike):
    output_file = Path(output_file)
    if not output_file.name.endswith(".ndjson.gz"):
        raise ValueError("Output file must end with .ndjson.gz")

    # write the plain .ndjson file first, then gzip it into the target path
    ndjson_file = output_file.parent / output_file.stem
    with ndjson_file.open('w') as f:
        ndjson.dump(data, f)
    gzip_file(ndjson_file, output_file, keep=False)
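# Hedged usage sketch for write_to_ndjson_gz_file; assumes the gzip_file
# helper used above is available in the same module.
records = [{"id": 1, "label": "a"}, {"id": 2, "label": "b"}]
write_to_ndjson_gz_file(records, "records.ndjson.gz")
# a path that does not end in .ndjson.gz raises ValueError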
def write_to_file(file_name: int, index: defaultdict(list)):
    """
    Write the merged index to a separate file.
    """
    dict_for_index = sorted(index.items())  # sort posting lists
    with open(FINAL_INDEX_PATH + "{}.json".format(file_name), 'w') as file:
        ndjson.dump(dict_for_index, file)
def calculate(doc_top_prob, ID, window: int, out_dir=None, curb_incomplete=False):
    '''Calculate Novelty, Transience & Resonance on a single window.
    This function is wrapped in process_windows() - see it for details.
    '''
    # make sure there is a folder to save it
    if out_dir:
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

    # signal calculation
    idmdl = InfoDynamics(data=doc_top_prob, time=ID,
                         window=window, weight=0, sort=False)
    idmdl.novelty(meas=jsd)
    idmdl.transience(meas=jsd)
    idmdl.resonance(meas=jsd)

    lignes = list()
    for i, doc_id in enumerate(ID):
        d = dict()
        d["doc_id"] = doc_id
        # HACK because of IndexError
        try:
            d["novelty"] = idmdl.nsignal[i]
            d["transience"] = idmdl.tsignal[i]
            d["resonance"] = idmdl.rsignal[i]
            d["nsigma"] = idmdl.nsigma[i]
            d["tsigma"] = idmdl.tsigma[i]
            d["rsigma"] = idmdl.rsigma[i]
        except IndexError:
            print("[info] there was an Index Error, proceed with caution")
        lignes.append(d)

    if curb_incomplete:
        # keep only rows with full records
        lignes = lignes[window:-window]

    if out_dir:
        # make a filename
        filename = str(window) + 'W' + '.ndjson'
        outpath = os.path.join(out_dir, filename)
        # export
        with open(outpath, "w") as f:
            ndjson.dump(lignes, f)

    return None
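# Hedged usage sketch for calculate; doc_top_prob stands in for a
# document-topic probability matrix and the ID list for document identifiers,
# mirroring the arguments above. The output directory name is a placeholder.
import numpy as np

rng = np.random.default_rng(0)
doc_top_prob = rng.dirichlet(np.ones(20), size=100).tolist()  # 100 docs, 20 topics
doc_ids = [f"doc_{i}" for i in range(100)]
calculate(doc_top_prob, doc_ids, window=5, out_dir="ntr_signals", curb_incomplete=True)
# -> ntr_signals/5W.ndjson with novelty/transience/resonance per document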
def parse_dataset(path=RAW_DIR_NAME, decode=None, early_return=True):
    """
    Restructures dataset from '.ndjson' files into folders. Each folder will
    be of the form 'dataset/{LABEL}' and will contain 1 file per training
    example. Also saves the list of all filenames to 'filenames.txt'.

    @param path - str: path to directory containing dataset
    @param decode - None or "jpg": how to decode training examples
    @param early_return - bool: indicates whether method should return early
        if 'filenames.txt' already exists

    @returns list containing all the filenames of the training examples
        (relative to path)
    @returns list containing all the labels of the dataset
    """
    list_ids = []
    labels = set()

    # If the filenames.txt file already exists, parse the file to find
    # list_ids and labels, and return early
    if decode == 'jpg':
        list_ids_filename = os.path.join(path, '../img/' + 'filenames.txt')
    else:
        list_ids_filename = os.path.join(path, 'filenames.txt')
    if early_return and os.path.exists(list_ids_filename):
        with open(list_ids_filename) as f:
            list_ids = ndjson.load(f)
        for list_id in list_ids:
            label = os.path.basename(os.path.dirname(list_id))
            labels.add(label)
        return list_ids, list(labels)

    # Loop through all '.ndjson' files and split into individual files
    pool = mp.Pool(mp.cpu_count())
    files = os.listdir(path)
    files = [f for f in files if os.path.splitext(f)[1] == '.ndjson']
    list_ids_temp = []
    parse = functools.partial(parse_label, path=path, decode=decode)
    pool.map_async(parse, files, callback=list_ids_temp.extend)
    pool.close()
    pool.join()

    # Convert list_ids_temp from a list of lists to a flat list
    list_ids = []
    for list_id in list_ids_temp:
        list_ids += list_id

    # Write output to 'dataset/filenames.txt' and find all labels
    with open(list_ids_filename, 'w') as f:
        ndjson.dump(list_ids, f)
    for list_id in list_ids:
        label = os.path.basename(os.path.dirname(list_id))
        labels.add(label)

    return list_ids, list(labels)
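# Hedged usage sketch for parse_dataset; 'quickdraw_raw' is a placeholder
# directory that would hold the per-class .ndjson files.
list_ids, labels = parse_dataset(path='quickdraw_raw', decode=None, early_return=True)
print(len(list_ids), 'training examples across', len(labels), 'labels')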
def run_eval(opt=None, model=None, loader=None, dataset='test', write_result=False):
    if opt is None:
        opt = TestOptions().parse()
    if model is None:
        model = SketchModel(opt)
    if loader is None:
        loader = load_data(opt, datasetType=dataset, permutation=opt.permutation)
    # print(len(loader))

    if opt.eval_way == 'align':
        predictList, lossList = eval_align_batchN(model, loader, P=opt.points_num)
    elif opt.eval_way == 'unalign':
        predictList, lossList = eval_unalign_batch1(model, loader)
    else:
        raise NotImplementedError('eval_way {} not implemented!'.format(opt.eval_way))
    # print(predictList.shape)

    testData = []
    with open(
            os.path.join('data', opt.dataset, 'train',
                         '{}_{}.ndjson'.format(opt.class_name, dataset)),
            'r') as f:
        testData = ndjson.load(f)

    if opt.metric_way == 'wlen':
        p_metric_list, c_metric_list = eval_with_len(testData, predictList)
    elif opt.metric_way == 'wolen':
        p_metric_list, c_metric_list = eval_without_len(testData, predictList)
    else:
        raise NotImplementedError('metric_way {} not implemented!'.format(opt.metric_way))

    if write_result:
        testData = get_eval_result(testData, predictList)
        result_path = os.path.join('data', opt.dataset, 'train',
                                   '{}_{}.ndjson'.format(opt.class_name, 'res'))
        with open(result_path, 'w') as f:
            ndjson.dump(testData, f)

    loss_avg = np.average(lossList)
    P_metric = np.average(p_metric_list)
    C_metric = np.average(c_metric_list)
    # print('P_metric:{:.4}%\tC_metric:{:.4}%'.format(P_metric*100, C_metric*100))

    return loss_avg, P_metric, C_metric
def writer(file_res, fname, outdir):
    # check for outdir
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # file extension
    out_path = os.path.join(outdir, fname + '.ndjson')

    # export
    with open(out_path, 'w') as fout:
        ndjson.dump(file_res, fout)

    return None
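# Hedged usage sketch for writer: dumps a list of dicts to <outdir>/<fname>.ndjson,
# creating the output directory if needed (names below are placeholders).
writer(file_res=[{'doc': 1}, {'doc': 2}], fname='session_01', outdir='processed')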
def generate(filename, lang):
    with open(filename) as i:
        items = ndjson.load(i)

    translated_items = generate_synthetic_data(items, lang)

    basename = os.path.basename(filename)
    basename_without_extension = os.path.splitext(basename)[0]
    output_name = "{}_{}.ndjson".format(basename_without_extension, lang)

    with open(output_name, "w") as o:
        ndjson.dump(translated_items, o)
def agregando():
    try:
        key, value = word.split('->')
        if key in agregar:
            tempDictionary[key] = value
    except:
        key, value1, value2 = word.split('->')
        # if key in acceptedFields:
        tempDictionary[key] = value1 + value2


# EXTRACT
# with codecs.open(rawPath, 'r', encoding='utf-8', errors='ignore') as rawFile:
#     lines = rawFile.readlines()
#     tempDictionary = {}
#     total = 0

# CLEAN
#     for line in lines:
#         if 'BREAK-REVIEWED' in line:
#             tempDictionary = {}
#         elif re.search('[0-9]+\.[0-9]+', line):
#             tempDictionary['price'] = float(re.findall(
#                 '[0-9]+\.[0-9]+', line)[0])
#         else:
#             # Cleaning
#             cleanLine = line.split()
#             cleanLine = " ".join(cleanLine)
#             cleanLine = cleanLine.replace('<->', ' - ')
#             cleanLine = cleanLine.replace('ice>Link Plus -> ', '')
#             cleanLine = cleanLine.replace('\"', '')
#             cleanLine = cleanLine.replace('.', '')
#             cleanLine = cleanLine.replace("'", "")
#             soup = BeautifulSoup(cleanLine, "html.parser")
#             cleanLine = soup.get_text()
#             # if re.search('[0-9]+["] ', cleanLine):
#             #     cleanLine = cleanLine.replace('"', '')
#             cleanLine = cleanLine.lower()
#             # Find key-value pairs
#             words = re.findall('\S+->.*?(?= \S+->|$)', cleanLine)
#
#             # Get data
#             for word in words:
#                 agregando()
#
#             if set(agregar).issubset(tempDictionary):
#                 productList.append(tempDictionary.copy())

# SAVE THE CLEANED DATA TO A JSON FILE
with open('backup.json', 'w') as f:
    ndjson.dump(productList, f, sort_keys=True, cls=DecimalEncoder)
def fetch_kibana_object(obj_type, exportpath):
    try:
        print('# Fetching kibana objects: %s' % obj_type)
        response = requests.post(KIBANA_OBJECTS_EXPORT_URL,
                                 json={'type': obj_type},
                                 verify=False,
                                 auth=(KIBANA_USER, KIBANA_PASS),
                                 headers={'kbn-xsrf': 'true'})
        if response.status_code != 200:
            print('!!! Error fetching kibana object %s: HTTP status code %s' %
                  (obj_type, response.status_code))
        else:
            rawData = response.text.encode('utf-8')
            items = ndjson.loads(rawData)
            if obj_type != 'index-pattern':
                toExport = []
                for ip in items:
                    if 'attributes' in ip.keys() and 'title' in ip['attributes']:
                        if re.match(REDELK_OBJ_FILTER,
                                    ip['attributes']['title'],
                                    re.IGNORECASE):
                            ip.pop('updated_at', None)
                            ip['version'] = '1'
                            toExport.append(ip)
                export_file = os.path.join(
                    exportpath,
                    '%s%s.ndjson' % (EXPORT_FILES_PREFIX_KIBANA, obj_type))
                print('\tExporting %s: %s' % (obj_type, export_file))
                with open(export_file, 'w') as f:
                    ndjson.dump(toExport, f)
            else:
                for ip in items:
                    if 'attributes' in ip.keys() and 'title' in ip['attributes']:
                        if re.match(INDEX_PATTERNS_FILTER,
                                    ip['attributes']['title'],
                                    re.IGNORECASE):
                            # print('%s: %s' % (obj_type, ip['attributes']['title']))
                            pn = (ip['attributes']['title'][:-2]
                                  if ip['attributes']['title'].endswith('-*')
                                  else ip['attributes']['title'])
                            ip.pop('updated_at', None)
                            ip['version'] = '1'
                            export_file = os.path.join(
                                exportpath,
                                '%s%s_%s.ndjson' % (EXPORT_FILES_PREFIX_KIBANA, obj_type, pn))
                            print('\tExporting %s: %s' % (obj_type, export_file))
                            with open(export_file, 'w') as f:
                                ndjson.dump([ip], f)
    except Exception as e:
        print('!!! Error fetching kibana object %s: %s' % (obj_type, e))
def generate_nd_json(*args, **kwargs):
    item_number = 10
    users = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
    events = ['tutorial', 'iap']
    items = [{
        'user': random.choice(users),
        'timestamp': str(random_timestamp.random_timestamp(2020)),
        'evtname': random.choice(events),
        'spend': round(random.random(), 2)
    } for i in range(item_number)]

    # dump to file-like objects
    with open(
            os.path.join('/home/airflow/gcs', 'data', 'test',
                         f"{kwargs['execution_date']}.ndjson"), 'w') as f:
        ndjson.dump(items, f)
def test_validate_ndjson_uuid(tmp_path, project):
    file_name = "repeat_uuid.ndjson"
    file_path = tmp_path / file_name
    repeat_uuid = PREDICTIONS.copy()
    repeat_uuid[0]['uuid'] = 'test_uuid'
    repeat_uuid[1]['uuid'] = 'test_uuid'

    with file_path.open("w") as f:
        ndjson.dump(repeat_uuid, f)

    with pytest.raises(UuidError):
        project.upload_annotations(name="name", annotations=str(file_path))

    with pytest.raises(UuidError):
        project.upload_annotations(name="name", annotations=repeat_uuid)
def get_ab_test_information(deviceType):
    # Select authentication keys for App
    if deviceType.upper() == 'IOS':
        APP_ID = IOS_APP_ID_KEY
        EXPORT_KEY = IOS_EXPORT_KEY
        CONTENT_KEY = IOS_CONTENT_READ_ONLY_KEY
    elif deviceType.upper() == 'ANDROID':
        APP_ID = ANDROID_APP_ID_KEY
        EXPORT_KEY = ANDROID_EXPORT_KEY
        CONTENT_KEY = ANDROID_CONTENT_READ_ONLY_KEY
    else:
        # TODO: How do you make sure that if this happens, airflow knows it was a failed job?
        logging.error(f'{job_name} - get_ab_test_information: Device not one of IOS or Android. Stopping Application')
        return 'Device not one of IOS or Android'

    # Construct abTests URL
    getABTests_url = getABTests_api + f'&appId={APP_ID}&clientKey={CONTENT_KEY}&apiVersion=1.0.6'

    # Get AB Tests Information from API
    logging.info(f'{job_name} - get_ab_test_information: Requesting list of A/B Tests from API for {deviceType}')
    http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
    response = http.request('GET', getABTests_url)
    response_return = response.data.decode('utf-8')
    response_json = json.loads(response_return)

    # Check response for successful data pull
    logging.info(f'{job_name} - get_ab_test_information: Checking response for successful data retrieval')
    if response_json['response'][0]['success'] == True and response.status == 200:
        logging.info(f'{job_name} - get_ab_test_information: Data successfully retrieved')
        data = response_json['response'][0]
        json_file = '/Users/gkaberere/spark-warehouse/leanPlum/saved_json_ndjson.json'
        with open(json_file, 'w') as file:
            ndjson.dump(data, file)
    else:
        # TODO: How do you make sure that if this happens airflow knows it's a failed job
        logging.error(f'{job_name} - get_ab_test_information: Response did not meet success = True or have a response status = 200')
        return
    return
def test_create_from_local_file(tmp_path, project):
    name = str(uuid.uuid4())
    file_name = f"{name}.ndjson"
    file_path = tmp_path / file_name
    with file_path.open("w") as f:
        ndjson.dump(PREDICTIONS, f)

    bulk_import_request = project.upload_annotations(
        name=name, annotations=str(file_path))

    assert bulk_import_request.project() == project
    assert bulk_import_request.name == name
    assert bulk_import_request.error_file_url is None
    assert bulk_import_request.status_file_url is None
    assert bulk_import_request.state == BulkImportRequestState.RUNNING
    __assert_file_content(bulk_import_request.input_file_url)
def test_create_from_local_file(tmp_path, predictions, configured_project):
    name = str(uuid.uuid4())
    file_name = f"{name}.ndjson"
    file_path = tmp_path / file_name
    with file_path.open("w") as f:
        ndjson.dump(predictions, f)

    bulk_import_request = configured_project.upload_annotations(
        name=name, annotations=str(file_path), validate=False)

    assert bulk_import_request.project() == configured_project
    assert bulk_import_request.name == name
    assert bulk_import_request.error_file_url is None
    assert bulk_import_request.status_file_url is None
    assert bulk_import_request.state == BulkImportRequestState.RUNNING
    assert_file_content(bulk_import_request.input_file_url, predictions)
def write_dict_to_file(file_number: int,
                       dict_for_index: defaultdict(list)) -> int:
    """
    Write one block of the index, built from a batch of texts, to a temporary
    index file. First stage of creating the index with SPIMI.
    """
    for key in dict_for_index.keys():
        term_freqs = Counter(dict_for_index[key])  # count term freqs in docs
        dict_for_index[key] = term_freqs
    dict_for_index: dict = sorted(dict_for_index.items())
    with open("data/index_blocks/index_file{}.txt".format(str(file_number)),
              "w") as f:
        ndjson.dump(dict_for_index, f)  # write as json for comfortable line-by-line reading
    file_number += 1  # this var is used for naming tmp index files
    return file_number
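# Hedged usage sketch for write_dict_to_file; the tiny block below maps each
# term to the doc ids it occurs in, which the function collapses to
# per-document frequencies before dumping.
from collections import defaultdict

block = defaultdict(list)
block['apple'].extend([1, 1, 2])  # 'apple' twice in doc 1, once in doc 2
block['berry'].append(2)
next_file_number = write_dict_to_file(0, block)  # writes data/index_blocks/index_file0.txt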
def save_training_example(drawing, path, decode=None):
    """
    Saves a single training example to the directory of the specified path.
    The filename will be set to the key_id.

    @param drawing - dict: raw data from the Quick, Draw! dataset with keys
        'word', 'key_id', and 'drawing'
    @param decode (None or "jpg"): whether to decode sketches as images.
        By default, sketches are saved as ndjson files.
    @param path - str: folder where training examples will be stored.

    @returns str - the filename where the training example is saved.
    """
    ext = '.jpg' if decode == 'jpg' else '.ndjson'
    filename = os.path.join(path, drawing['key_id'] + ext)
    if not os.path.exists(filename):
        drawing_simplified = [{
            'word': drawing['word'],
            'key_id': drawing['key_id'],
            'drawing': drawing['drawing']
        }]
        if decode == 'jpg':
            drawing_decoded = decode_drawing(drawing['drawing'])
            cv2.imwrite(filename, drawing_decoded)
        else:
            with open(filename, mode='w') as f:
                ndjson.dump(drawing_simplified, f)

    # Return only the label with the key_id for the sake of space.
    result = os.path.join(drawing['word'], drawing['key_id'] + ext)
    return result
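# Hedged usage sketch for save_training_example; the drawing dict mirrors the
# raw Quick, Draw! record fields used above, and 'dataset/cat' is a placeholder
# folder that is assumed to exist.
drawing = {
    'word': 'cat',
    'key_id': '5152802093400064',
    'drawing': [[[10, 50, 90], [20, 5, 20]]],  # one stroke: x coords, y coords
}
rel_path = save_training_example(drawing, path='dataset/cat', decode=None)
print(rel_path)  # 'cat/5152802093400064.ndjson'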
def concat_texts_timebins(lemma_path, metadata_path, outdir, timebin='10Min'):
    '''Concatenate documents into time bins of a given size and export to outdir.'''
    # load files
    with open(lemma_path) as fin:
        file_lemma = ndjson.load(fin)

    with open(metadata_path) as fin:
        file_meta = ndjson.load(fin)

    timestamps = [doc['start'] for doc in file_meta]
    del file_meta

    # resample
    df_resampled = (pd.DataFrame(file_lemma,
                                 index=pd.to_timedelta(timestamps))
                    .resample(timebin)
                    .sum())

    # get rid of 0 (no document in time bin)
    df_resampled = (df_resampled
                    .replace(0, np.nan)
                    .dropna())

    # get rid of [] (there is a document but no features in time bin)
    df_resampled['text'] = df_resampled.text[df_resampled.text.apply(len) > 0]
    df_resampled = (df_resampled
                    .dropna()
                    .reset_index())

    # get timestamp as str
    df_resampled['time'] = df_resampled['index'].astype(str).str.extract('days (.*?)\.')

    # serialize
    file_res = []
    for i, row in df_resampled.iterrows():
        res = dict()
        res.update({
            'time': row.time,
            'text': row.text,
            'lemma': row.lemma,
            'pos': row.pos,
            'dep': row.dep,
            'ner': row.ner
        })
        file_res.append(res)

    outfname = os.path.basename(lemma_path)
    with open(os.path.join(outdir, outfname), 'w') as fout:
        ndjson.dump(file_res, fout)

    return None
def test_validate_ndjson_uuid(tmp_path, configured_project, predictions):
    file_name = "repeat_uuid.ndjson"
    file_path = tmp_path / file_name
    repeat_uuid = predictions.copy()
    repeat_uuid[0]['uuid'] = 'test_uuid'
    repeat_uuid[1]['uuid'] = 'test_uuid'

    with file_path.open("w") as f:
        ndjson.dump(repeat_uuid, f)

    with pytest.raises(MALValidationError):
        configured_project.upload_annotations(name="name",
                                              annotations=str(file_path))

    with pytest.raises(MALValidationError):
        configured_project.upload_annotations(name="name",
                                              annotations=repeat_uuid)
def run_standard(paths, outdir):
    '''
    Alternative: forbidden_pos=['PUNCT', 'CONJ', 'CCONJ', 'DET', 'SYM']
    '''
    # iterate through files
    for path in tqdm(paths):
        # get file name
        fname = os.path.basename(path)
        outpath = os.path.join(outdir, fname)

        # open file
        with open(path) as f:
            session = ndjson.load(f)

        # extend stopwords
        # stopwords = get_first_name_set()
        stopwords = set()
        stopwords.update([
            # 'marisha',
            # 'liam',
            # 'travis',
            # 'keyleth',
            # 'ane',
            # 'taliesin',
            # 'hav',
            # 'orion',
            '-pron-',
            "'s",
            'yeah',
            'guy'
        ])

        # process all docs in that file
        features = extract_features(
            session,
            tokentype='lemma',
            forbidden_pos=['PUNCT', 'CONJ', 'CCONJ', 'DET', 'SYM'],
            langs='en',
            # filter out names
            extended_stopwords=stopwords)

        # export
        with open(outpath, 'w') as f:
            ndjson.dump(features, f)
def fetch_data_json(fn, query, fields, format_fn, upload):
    try:
        conn = mysql.connector.connect(
            host=os.environ.get("SUMO_MYSQL_HOST", "localhost"),
            port=os.environ.get("SUMO_MYSQL_PORT", 3306),
            database=os.environ.get("SUMO_MYSQL_DB_NAME", "kitsune"),
            user=os.environ.get("SUMO_MYSQL_USERNAME", "root"),
            password=os.environ.get("SUMO_MYSQL_PASSWORD", ""))
        # conn = mysql.connector.connect(host='127.0.0.1', port=3306,
        #                                database='kitsune',
        #                                user='******',
        #                                password='******')

        if conn.is_connected():
            print('Connected to MySQL database')

        cursor = conn.cursor()
        cursor.execute(query)

        row_headers = [x[0] for x in cursor.description]  # this will extract row headers
        print(row_headers)
        rows = cursor.fetchall()

        json_data = []
        rownum = 0
        for row in rows:
            rownum = rownum + 1
            json_data.append(dict(zip(row_headers, row)))
            if rownum % 100000 == 0:
                print(rownum)

        with open("/tmp/" + fn, 'w') as f:
            # convert datetime to utc and format as str
            ndjson.dump(json_data, f, default=convert_pst_to_utc)

        if upload:
            CHUNK_SIZE = 128 * 1024 * 1024  # season to taste
            blob = sumo_bucket.blob("kitsune/" + fn, chunk_size=CHUNK_SIZE)
            blob.upload_from_filename("/tmp/" + fn)

        cursor.close()
    except Error as e:
        print(e)
    finally:
        conn.close()
def rm_trainingdata(old_fn, new_fn):
    with open(old_fn, "r") as rf, open(new_fn, "w") as wf:
        data = ndjson.load(rf)
        for i, d in enumerate(data):
            if data_dic[d['kana']].m_max_in < len(d['events']):
                print(i, d['kana'], "Greater than maximum")
                continue
            if data_dic[d['kana']].m_min_in > len(d['events']):
                print(i, d['kana'], "Less than minimum")
                continue
            if not d['events'][0][0] in data_dic[d['kana']].m_start:
                print(i, d['kana'], "Starting point is different")
                continue
            if not d['events'][-1][0] in data_dic[d['kana']].m_finish:
                print(i, d['kana'], "End point is different")
                continue
            ndjson.dump([d], wf)
            wf.write("\n")
def download_table(
        client: AitoClient,
        table_name: str,
        output_folder: PathLike,
        file_name: str = None,
        batch_size: int = 5000,
        gzip_output: bool = False
):
    """download a table to a NDJSON file or a gzipped NDJSON file

    :param client: the AitoClient instance
    :type client: AitoClient
    :param table_name: the name of the table
    :type table_name: str
    :param output_folder: the folder where the output file is written to
    :type output_folder: PathLike
    :param file_name: the name of the output file, defaults to None, in which case the table name is used as the file name
    :type file_name: str
    :param batch_size: the number of entries to be downloaded at once, defaults to 5000
    :type batch_size: int
    :param gzip_output: gzip the output file, defaults to False
    :type gzip_output: bool
    """
    if not file_name:
        file_name = table_name
    out_file_path = Path(output_folder) / f'{file_name}.ndjson'
    if out_file_path.exists():
        LOG.warning(f'output file {out_file_path} already exists')

    LOG.debug(f'downloading table `{table_name}` to {out_file_path}')
    table_size = get_table_size(client, table_name)
    begin_idx = 0
    while begin_idx < table_size:
        last_idx = begin_idx + batch_size if begin_idx + batch_size <= table_size else table_size
        LOG.debug(f'downloading table chunk {begin_idx}:{last_idx}...')
        entries_batch = query_entries(
            client=client, table_name=table_name, offset=begin_idx, limit=batch_size)
        with out_file_path.open('a+') as f:
            ndjson.dump(entries_batch, f)
            if last_idx != table_size:
                f.write('\n')
        LOG.debug(f'downloaded table chunk {begin_idx}:{last_idx}')
        begin_idx += batch_size

    if gzip_output:
        gzip_file(out_file_path, keep=False)
    LOG.info(f'downloaded table `{table_name}` to {out_file_path}')
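# Hedged usage sketch for download_table; the instance url and API key are
# placeholders, and the AitoClient constructor shown is an assumption about
# the surrounding SDK rather than something stated above.
client = AitoClient(instance_url="https://my-instance.aito.app", api_key="MY_API_KEY")
download_table(client, table_name="products", output_folder="exports",
               batch_size=1000, gzip_output=True)
# -> exports/products.ndjson.gz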
def write_block_to_file(
        index: defaultdict(list),
        number: int,
) -> int:
    """
    Write defaultdict() to json file
    """
    index.pop("", None)  # remove empty key
    for key in index.keys():
        # reformat data as doc_id: frequency_keyword_in_doc_id_file
        index[key] = Counter(index[key])
    index = sorted(index.items())  # sort posting lists
    with open(BUILDED_INDEX_PATH + "index{}.json".format(str(number)), "w") as file:
        ndjson.dump(index, file)
    number += 1  # store file number, used afterwards for merge
    return number
def download_user_history(api, output_name, screen_name=None, user_id=None,
                          since_id=None, exclude_replies=False,
                          save_retweeters=False):
    res = []
    for page in tweepy.Cursor(api.user_timeline,
                              screen_name=screen_name,
                              user_id=user_id,
                              tweet_mode="extended",
                              since_id=since_id,
                              count=200,
                              exclude_replies=exclude_replies).pages():
        res.extend(page)
    res = [item._json for item in res]

    with open(output_name, "w") as output:
        ndjson.dump(res, output)

    if save_retweeters:
        print("Extracting retweeters")
        users = {}
        for item in res:
            users[item["id"]] = api.retweeters(item["id"])
        with open(output_name + ".retweets.json", "w") as output:
            json.dump(users, output)
        return users
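# Hedged usage sketch for download_user_history; assumes an authenticated
# tweepy API object (credentials below are placeholders) from a tweepy version
# that supports the Cursor/user_timeline interface used above.
auth = tweepy.OAuthHandler("API_KEY", "API_SECRET")
auth.set_access_token("ACCESS_TOKEN", "ACCESS_SECRET")
api = tweepy.API(auth, wait_on_rate_limit=True)
download_user_history(api, "timeline.ndjson", screen_name="example_user",
                      exclude_replies=True, save_retweeters=False)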
def add_new_tweets_to_dump(output_file: str) -> None:
    try:
        with open(output_file, 'r') as fp:
            existing_tweets = ndjson.load(fp, object_hook=tweet_json_decode_hook)
    except FileNotFoundError:
        existing_tweets = []

    existing_tweet_ids = set(tweet.id for tweet in existing_tweets)
    newest_tweet_year = (existing_tweets[-1].created_at.year
                         if len(existing_tweets) > 0 else 2009)

    maybe_new_tweets = _get_all_tweets_after_year(newest_tweet_year)
    new_tweets = (tweet for tweet in maybe_new_tweets
                  if tweet.id not in existing_tweet_ids)
    all_tweets = itertools.chain(existing_tweets, new_tweets)

    with open(output_file, 'w') as fp:
        ndjson.dump((encode_tweet_for_json(tweet) for tweet in all_tweets), fp)