def load_pickle(filename, filepath):
    """ Loads a pickle file from disk.

    :param filename: Name of the pickle file (without extension).
    :param filepath: Directory containing the file.
    :return: The unpickled object, or False on failure.
    """
    pkl_path = join(filepath, filename + ".pkl")
    logger.info("Reading from pickle file: [{}]".format(pkl_path))
    if exists(pkl_path):
        try:
            with sopen(pkl_path, 'rb') as pkl_file:
                return pk.load(pkl_file)
        except Exception as e:
            logger.warning("Could not open file: [{}]".format(pkl_path))
            logger.warning("Failure reason: [{}]".format(e))
            return False
    else:
        logger.warning("File not found at: [{}]".format(pkl_path))
        return False
def read_titles(self, classes_keys=None, title_path=None,
                title_file="titles.txt",
                encoding=config["text_process"]["encoding"]):
    """ Reads the titles.txt file and returns an OrderedDict of id: title.

    :param classes_keys: List of sample2cats keys; only samples whose ids
        appear in this list are stored.
    :param title_file: Name of the titles file.
    :param title_path: Full path to the titles file.
    :param encoding: Encoding of the titles file.
    :return: OrderedDict of id: title.
    """
    logger.info("Reading the titles.txt file into an OrderedDict of id: title.")
    titles = OrderedDict()
    if title_path is None:
        title_path = join(self.raw_txt_dir, title_file)
    with sopen(title_path, encoding=encoding) as raw_title_ptr:
        for line in raw_title_ptr:
            tokens = line.split()
            # Add this sample only if a corresponding sample2cats entry exists.
            if classes_keys is None or tokens[0] in classes_keys:
                titles[tokens[0].strip()] = " ".join(tokens[1:]).strip()
    return titles
def save_pickle(data, filename, filepath, overwrite=False):
    """ Saves a Python object as a pickle file.

    :param data: Object to pickle.
    :param filename: Name of the pickle file (without extension).
    :param filepath: Directory to write the file to.
    :param overwrite: Overwrite an existing file if True.
    :return: True on success, False on failure.
    """
    pkl_path = join(filepath, filename + ".pkl")
    logger.info("Writing to pickle file: [{}]".format(pkl_path))
    if not overwrite and exists(pkl_path):
        logger.warning(
            "File [{}] already exists and overwrite == False.".format(pkl_path))
        return True
    try:
        if isfile(pkl_path):
            logger.info("Overwriting pickle file: [{}]".format(pkl_path))
        with sopen(pkl_path, 'wb') as pkl_file:
            pk.dump(data, pkl_file)
        return True
    except Exception as e:
        logger.warning("Could not write to pickle file: [{}]".format(pkl_path))
        logger.warning("Failure reason: [{}]".format(e))
        return False
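# A minimal round-trip sketch of the two pickle helpers above; the directory
# and the payload are hypothetical, purely for illustration. save_pickle
# returns True on success, and load_pickle returns False if the file is
# missing or unreadable, so the result is checked before use.
def _pickle_round_trip_example():
    data = {"epochs": 10, "lr": 1e-3}
    if save_pickle(data, "model_cache", "/tmp/cache", overwrite=True):
        restored = load_pickle("model_cache", "/tmp/cache")
        assert restored == data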
def write_file(data, filename, filepath='', overwrite=False, mode='w',
               encoding="utf-8", date_time_tag='', verbose=False):
    """ Writes [data] to a text file.

    :param data: Content to write; converted with str().
    :param filename: Name of the text file (without extension).
    :param filepath: Directory to write the file to.
    :param overwrite: Overwrite an existing file if True.
    :param mode: File mode, e.g. 'w' or 'a'.
    :param encoding: Encoding of the output file.
    :param date_time_tag: Optional tag prefixed to the filename.
    :param verbose: Log the output path if True.
    :return: True if the file already exists and overwrite == False.
    """
    txt_path = join(filepath, date_time_tag + filename + ".txt")
    if not overwrite and exists(txt_path):
        logger.warning(
            "File [{}] already exists and overwrite == False.".format(txt_path))
        return True
    with sopen(txt_path, mode, encoding=encoding) as text_file:
        if verbose:
            logger.info("Saving text file: [{}]".format(txt_path))
        text_file.write(str(data))
        text_file.write("\n\n")
def loadPostgres():
    """ Loads the first CSV found under the S3 prefix into Postgres. """
    csv_files = get_s3_keys(bucket, prefix, 'csv')
    csv_path = "s3://%s/%s" % (bucket, csv_files[0])
    print(csv_path)
    conn_string = "host=%s dbname=%s user=%s password=%s" % (
        postgres_server, dbname, dbuser, password)
    conn = psycopg2.connect(conn_string)
    cur = conn.cursor()
    # Create the target table if needed and clear any previous rows.
    cur.execute("""CREATE TABLE IF NOT EXISTS top1000(
                       title text,
                       budget float8,
                       year integer,
                       revenue float8,
                       popularity float8,
                       ratio float8,
                       companiesList text,
                       url text,
                       abstract text
                   )
                """)
    cur.execute("""DELETE FROM top1000""")
    conn.commit()
    # smart_open streams the CSV straight from S3 into COPY.
    with sopen(csv_path, 'r') as f:
        cur.copy_from(f, 'top1000', sep='\t')
    conn.commit()
def uploadFile(req):
    """ Writes the uploaded file to GCS and returns a CORS-enabled response. """
    with sopen("gs://garage-labs/chinese-whisperer/latest.webm", "wb") as fp:
        content = req.files.get('file').read()
        fp.write(content)
    response = flask.jsonify({'some': 'data'})
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
def transfer(file, mpx_id):
    """ Streams [file] from the source host into its destination bucket. """
    url = "http://{0}{1}".format(ns_host, file)
    bucket = destination(file)
    try:
        with sopen(url, 'rb', 1024 * 500,
                   transport_params=dict(headers=auth(file))) as fin:
            with sopen(bucket, 'wb',
                       transport_params=dict(session=session)) as fout:
                # Copy in 2 KB chunks until the source is exhausted; the
                # "with" blocks close both ends.
                while True:
                    buffer = fin.read(1024 * 2)
                    if not buffer:
                        break
                    fout.write(buffer)
        cleanup(file, mpx_id)
    except Exception as e:
        manage_threads(file, mpx_id)
        logger(e, 'threads', 'error')
        logger("NOT TRANSFERRED - {0}".format(file), 'info', 'error')
def contains(inputFile, outputFile, search, regex):
    """ Copies every line of [inputFile] that matches [search] (as a regex if
    [regex] is True, else as a substring) into [outputFile]. """
    with open(inputFile, 'r') as input, sopen(outputFile, 'w') as output:
        for line in input:
            if regex and re.search(search, line) is not None:
                output.write(line)
            elif not regex and search in line:
                output.write(line)
def duplicatedLines(inputFile, outputFile, withoutBlankLines):
    """ Copies [inputFile] to [outputFile] while dropping duplicate lines.
    When [withoutBlankLines] is True, blank lines are never recorded as seen,
    so every blank line is kept. """
    with open(inputFile, 'r') as input, sopen(outputFile, 'w') as output:
        linesSeen = set()
        for line in input:
            if line not in linesSeen:
                output.write(line)
                if not (withoutBlankLines and line.strip() == ''):
                    linesSeen.add(line)
def load_json(filename: str, filepath: str = '', date_time_tag: str = '',
              ext: str = ".json", show_path: bool = False) -> OrderedDict:
    """ Loads a JSON file as a Python OrderedDict.

    :param show_path: Log the file path if True.
    :param ext: Extension to append to the filename.
    :param filename: Name of the JSON file (without extension).
    :param filepath: Directory containing the file.
    :param date_time_tag: Optional tag prefixed to the filename.
    :return: OrderedDict, or False if the file does not exist.
    """
    file_loc = join(filepath, date_time_tag + filename + ext)
    if show_path:
        logger.info("Reading JSON file: [{}]".format(file_loc))
    if exists(file_loc):
        try:
            with sopen(file_loc, encoding="utf-8") as file:
                return OrderedDict(json.load(file))
        except Exception as e:
            logger.warning(
                "Could not open file as JSON: [{}]. \n Reason: [{}]".format(
                    file_loc, e))
            # Fall back to reading the raw text and parsing it as JSON.
            with sopen(file_loc, encoding="utf-8") as file:
                return json.loads(file.read())
    else:
        logger.warning("File does not exist at: [{}]".format(file_loc))
        return False
def _sopen(self, key, *args, **kwargs):
    """ Opens [key] in the bucket via smart_open, embedding the credentials
    and the optional custom endpoint directly in the S3 URL. """
    creds = f"{self.aws_access_key_id}:{self.aws_secret_access_key}"
    server_port = ""
    if self.endpoint_url:
        parsed_url = urlparse(self.endpoint_url)
        if parsed_url.netloc:
            server_port = parsed_url.netloc
        else:
            # urlparse puts scheme-less endpoints in .path; assume port 80.
            server = parsed_url.path.split("/")[0]
            server_port = f"{server}:80"
    bucket_path = f"{self._bucket_name}/{key}"
    # Assemble "s3://creds@server:port@bucket/key", skipping empty parts.
    string = "s3://" + "@".join(
        i for i in [creds, server_port, bucket_path] if i)
    return sopen(string, *args, **kwargs)
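# Hedged illustration of the URL that _sopen assembles (all values here are
# hypothetical, not real credentials). Only the string construction is shown;
# whether a given smart_open version accepts credentials and endpoints
# embedded in the URL depends on that version.
def _example_s3_url():
    creds = "AKIAEXAMPLE:secretkey"
    server_port = "minio.local:9000"
    bucket_path = "my-bucket/logs/a.txt"
    url = "s3://" + "@".join(i for i in [creds, server_port, bucket_path] if i)
    assert url == "s3://AKIAEXAMPLE:secretkey@minio.local:9000/my-bucket/logs/a.txt"
    return url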
def contains(inputFile, outputFile, search, regex):
    """ Copies matching lines into [outputFile] and logs how many matched. """
    total = 0
    searched = 0
    with open(inputFile, 'r') as input, sopen(outputFile, 'w') as output:
        for line in input:
            total += 1
            if regex and re.search(search, line) is not None:
                output.write(line)
                searched += 1
            elif not regex and search in line:
                output.write(line)
                searched += 1
    log.info('Total : %d sentences.' % total)
    log.info('Searched : %d sentences. (%d%%)' % (
        searched, 100 * (float(searched) / float(total))))
def save_json(data, filename, filepath='', overwrite=False, indent=2,
              date_time_tag=''):
    """ Saves [data] as a JSON file, falling back to a str() dump and finally
    to a plain text file if serialization fails.

    :param data: Object to serialize.
    :param filename: Name of the JSON file (without extension).
    :param filepath: Directory to write the file to.
    :param overwrite: Overwrite an existing file if True.
    :param indent: JSON indentation width.
    :param date_time_tag: Optional tag prefixed to the filename.
    :return: True on success, False if writing JSON failed entirely.
    """
    json_path = join(filepath, date_time_tag + filename + ".json")
    logger.info("Saving JSON file: [{}]".format(json_path))
    if not overwrite and exists(json_path):
        logger.warning(
            "File [{}] already exists and overwrite == False.".format(json_path))
        return True
    try:
        with sopen(json_path, 'w') as json_file:
            try:
                json_file.write(json.dumps(data, indent=indent))
            except Exception as e:
                logger.warning("Writing JSON failed: [{}]".format(e))
                logger.warning("Writing as string: [{}]".format(json_path))
                json_file.write(json.dumps(str(data), indent=indent))
        return True
    except Exception as e:
        logger.warning("Writing JSON file [{}] failed: [{}]".format(
            join(filepath, filename), e))
        logger.warning("Writing as TXT: [{}]".format(filename + ".txt"))
        write_file(data, filename, date_time_tag=date_time_tag)
        return False
def trim(inputFile, outputFile, lines):
    """ Removes blank lines (when [lines] is True) or strips surrounding
    whitespace from every line, and logs how many lines were trimmed. """
    total = 0
    notTrimmed = 0
    with open(inputFile, 'r') as input, sopen(outputFile, 'w') as output:
        for line in input:
            total += 1
            if lines:  # Trim blank lines: copy only non-empty lines.
                if not re.search('^$', line):
                    output.write(line)
                    notTrimmed += 1
            else:  # Strip whitespace from both ends of the line.
                stripline = line.strip()
                output.write(stripline + '\n')
                if stripline == line[:-1]:
                    notTrimmed += 1
    log.info('Total : %d sentences.' % total)
    log.info('Trimmed : %d sentences. (%d%%)' % (
        total - notTrimmed, 100 * (float(total - notTrimmed) / float(total))))
def read_green_taxi_csv(url, fobj):
    """ Read a "green taxi" CSV file from the New York City Taxi and
    Limousine Commission (TLC) trip dataset, clean the data and write it into
    the provided binary file object.

    Note: tested against only one specific file:
    https://nyc-tlc.s3.us-east-1.amazonaws.com/trip%20data/green_tripdata_2013-09.csv
    """
    # smart-open makes it easy to open a file via HTTP(S), S3, GCS, local
    # etc. URLs.
    with sopen(url, mode='rb') as fobj_src:
        # The first line should be the header. Validate that it's what we
        # expect.
        line = fobj_src.readline()
        if line.rstrip().decode(ENCODING).split(',') != HEADER:
            raise InvalidHeaderError(line)
        # Ignore any whitespace-only lines between the header and the data.
        # Return if we encounter the end of the file.
        while True:
            offset = fobj_src.tell()
            line = fobj_src.readline()
            if line.rstrip():
                fobj_src.seek(offset)
                break
            if not line:
                return
        # Ensure that there are at least 20 fields and preserve only these
        # fields via regexp. The data has an odd structure in that there are
        # additional trailing empty fields, which we ignore.
        for line in fobj_src:
            match = PATTERN_DATA.match(line)
            if not match:
                raise InvalidDataError(line)
            fobj.write(match.group(1))
            fobj.write(b'\n')
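# Hedged usage sketch for read_green_taxi_csv: the URL comes from the
# docstring's note, while the local output path is hypothetical. The cleaned
# rows are streamed into a local binary file.
def _green_taxi_example():
    url = ('https://nyc-tlc.s3.us-east-1.amazonaws.com/'
           'trip%20data/green_tripdata_2013-09.csv')
    with open('/tmp/green_tripdata_2013-09.clean.csv', 'wb') as fobj:
        read_green_taxi_csv(url, fobj)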
def read_classes(self, classes_dir=None, classes_file="cats.txt",
                 encoding=config["text_process"]["encoding"]):
    """ Reads the cats.txt file and returns an OrderedDict of id: class ids.

    :param classes_file: Name of the cats file.
    :param classes_dir: Directory containing the cats file.
    :param encoding: Encoding of the cats file.
    :return: OrderedDict of id: class ids.
    """
    logger.info(
        "Reading the cats.txt file into an OrderedDict of id: class ids.")
    cat_line_phrase = " "  # Phrase to recognize lines with category information.
    cat_sep_phrase = ", "  # Phrase to separate cats.
    classes = OrderedDict()
    cat_pool = set()
    if classes_dir is None:
        classes_dir = self.raw_txt_dir
    with sopen(join(classes_dir, classes_file),
               encoding=encoding) as raw_cat_ptr:
        sample_idx = raw_cat_ptr.readline().strip()
        for line in raw_cat_ptr:
            if cat_line_phrase in line:
                cats = line.split(cat_sep_phrase)  # Splitting on ', ' to get cats.
                cats = [x.strip() for x in cats]  # Removing extra characters like ' ' and '\n'.
                cat_pool.update(cats)
            else:  # A line without category info starts the next sample.
                classes[sample_idx] = list(cat_pool)
                cat_pool.clear()
                sample_idx = line.strip()
        if cat_pool:  # Flush the categories collected for the final sample.
            classes[sample_idx] = list(cat_pool)
    return classes
def read_desc(self, classes_keys=None, desc_path=None,
              desc_file="descriptions.txt",
              encoding=config["text_process"]["encoding"]):
    """ Reads the descriptions.txt file and returns an OrderedDict of id: desc.

    :param classes_keys: List of keys; only samples whose ids appear in this
        list are stored.
    :param desc_file: Name of the descriptions file.
    :param desc_path: Full path to the descriptions file.
    :param encoding: Encoding of the descriptions file.
    :return: OrderedDict of id: desc.
    """
    id_phrase = "product/productId: "  # Phrase to recognize lines with the sample id.
    id_remove = len(id_phrase)  # Prefix length to strip from the line.
    desc_phrase = "product/description: "  # Phrase to recognize lines with the sample description.
    desc_remove = len(desc_phrase)  # Prefix length to strip from the line.
    logger.info(
        "Reading the descriptions.txt file into an OrderedDict of id: desc.")
    descriptions = OrderedDict()
    if desc_path is None:
        desc_path = join(self.raw_txt_dir, desc_file)
    import itertools
    with sopen(desc_path, encoding=encoding) as raw_desc_ptr:
        # Reads two lines per iteration: an id line and a description line.
        for idx_line, desc_line in itertools.zip_longest(*[raw_desc_ptr] * 2):
            if id_phrase in idx_line:
                sample_id = idx_line[id_remove:].strip()
                # Add this sample only if a corresponding class exists.
                if classes_keys is None or sample_id in classes_keys:
                    if desc_phrase in desc_line:
                        sample_desc = desc_line[desc_remove:].strip()
                    else:
                        # Even if no description is found, the sample is kept
                        # as it might still have text in its title.
                        sample_desc = None
                    descriptions[sample_id] = sample_desc
    return descriptions
def read(self, url_or_path):
    return sopen(url_or_path, mode='rb',
                 transport_params=self.auth_client())
def gen_dicts(self, json_path=None,
              encoding=config["text_process"]["encoding"],
              specials="""_-@*#'"/\\""", replace=' '):
    """ Generates the data dictionaries from the original json file.

    :param replace: Character to replace with.
    :param specials: Characters to clean from txts.
    :param json_path: Path to raw json file.
    :param encoding: Encoding for the raw json file.
    :return: txts, classes (sample2cats) and cats (cattext2catid_map). Ids
        for which no categories were found (no_cat_ids) are saved to a JSON
        file rather than returned.
    """
    # The data is not in proper JSON format (single quotes instead of double
    # quotes), so the "json" library will not work; use ast.literal_eval.
    import ast
    from unidecode import unidecode
    logger.info("Generating the data dictionaries from the original json file.")
    txts = OrderedDict()
    classes = OrderedDict()
    cats = OrderedDict()
    no_cat_ids = []  # Stores ids for which no cats were found.
    if json_path is None:
        json_path = self.raw_json_dir
    with sopen(json_path, encoding=encoding) as raw_json_ptr:
        # Mapping used to clean special characters from txts.
        trans_table = File_Util.make_trans_table(specials=specials,
                                                 replace=replace)
        cat_idx = 0  # Holds the next free category index.
        for cnt, line in enumerate(raw_json_ptr):
            line_dict = ast.literal_eval(line.strip().replace('\n', '\\n'))
            if "categories" in line_dict:  # Check if "categories" exists.
                if "title" in line_dict:  # Use "title" if it exists.
                    txts[line_dict["asin"]] = unidecode(
                        str(line_dict["title"])).translate(trans_table)
                    # Append "description" to "title" with the keyword
                    # ". \nDESC: " if it exists.
                    if "description" in line_dict:
                        txts[line_dict["asin"]] = txts[
                            line_dict["asin"]] + ". \nDESC: " + unidecode(
                            str(line_dict["description"])).translate(
                            trans_table)
                else:
                    if "description" in line_dict:
                        # "title" is missing; use "description" alone.
                        txts[line_dict["asin"]] = ". \nDESC: " + line_dict[
                            "description"]
                    else:
                        # Neither "title" nor "description" exists; report
                        # and skip the sample.
                        logger.warning(
                            "Neither 'title' nor 'description' found for"
                            " sample id: [{}]. Adding sample to"
                            " 'no_cat_ids'.".format(line_dict["asin"]))
                        no_cat_ids.append(line_dict["asin"])
                        continue
                classes[line_dict["asin"]] = line_dict["categories"][0]
                for lbl in classes[line_dict["asin"]]:
                    # Assign a new category index to labels not seen before.
                    if lbl not in cats:
                        cats[lbl] = cat_idx
                        cat_idx += 1
                    # Replace the category text with its category id.
                    classes[line_dict["asin"]][
                        classes[line_dict["asin"]].index(lbl)] = cats[lbl]
            else:  # "categories" missing; record the id in "no_cat_ids".
                no_cat_ids.append(line_dict["asin"])
    File_Util.save_json(no_cat_ids, self.dataset_name + "_no_cat_ids",
                        filepath=self.dataset_dir)
    logger.info(
        "Number of txts: [{}], sample2cats: [{}] and cattext2catid_map: [{}]."
        .format(len(txts), len(classes), len(cats)))
    return txts, classes, cats
def write(self, url_or_path):
    return sopen(url_or_path, mode='wb',
                 transport_params=self.auth_client())
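# Hedged usage sketch for the read/write helpers above (the bucket URL and
# the surrounding client object are hypothetical). Both helpers return open
# smart_open file objects, so they compose naturally with "with".
def _copy_example(client):
    with client.read("s3://my-bucket/src.bin") as src, \
            client.write("s3://my-bucket/dst.bin") as dst:
        dst.write(src.read())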