Code example #1
File: File_Util.py Project: SamujjwalSam/XC_GCN
def load_pickle(filename, filepath):
    """
    Loads a pickled object from a file.

    :param filename: Name of the pickle file (without the ".pkl" extension).
    :param filepath: Directory containing the file.
    :return: The unpickled object, or False on failure.
    """
    # logger.debug("Method: load_pickle(pkl_file)")
    logger.info("Reading from pickle file: [{}]".format(
        join(filepath, filename + ".pkl")))
    if exists(join(filepath, filename + ".pkl")):
        try:
            logger.info("Reading pickle file: [{}]".format(
                join(filepath, filename + ".pkl")))
            with sopen(join(filepath, filename + ".pkl"), 'rb') as pkl_file:
                loaded = pk.load(pkl_file)
            return loaded
        except Exception as e:
            logger.warning("Could not open file: [{}]".format(
                join(filepath, filename + ".pkl")))
            logger.warning("Failure reason: [{}]".format(e))
            return False
    else:
        logger.warning("File not found at: [{}]".format(
            join(filepath, filename + ".pkl")))
Code example #2
File: txt_loader.py Project: SamujjwalSam/XC_GCN
    def read_titles(self,
                    classes_keys=None,
                    title_path=None,
                    title_file="titles.txt",
                    encoding=config["text_process"]["encoding"]):
        """
        Reads the titles.txt file and returns an OrderedDict of id : title.

        :param classes_keys: List of sample2cats keys; only titles whose ids appear in this list are stored.
        :param title_file: Name of the titles file.
        :param title_path: Full path to the titles file; defaults to raw_txt_dir/title_file.
        :param encoding: Text encoding of the file.
        :return: OrderedDict mapping sample id to title.
        """
        logger.info(
            "Reads the titles.txt file and returns a OrderedDict of id : title."
        )
        titles = OrderedDict()
        if title_path is None: title_path = join(self.raw_txt_dir, title_file)
        with sopen(title_path, encoding=encoding) as raw_title_ptr:
            for cnt, line in enumerate(raw_title_ptr):
                tokens = line.split()
                if tokens and (classes_keys is None or tokens[0] in classes_keys):  # Add this sample only if a corresponding sample2cats entry exists; skip blank lines.
                    titles[tokens[0]] = " ".join(tokens[1:]).strip()
        return titles
Code example #3
File: File_Util.py Project: SamujjwalSam/XC_GCN
def save_pickle(data, filename, filepath, overwrite=False):
    """
    Saves a Python object as a pickle file.

    :param data: Object to pickle.
    :param filename: Name of the pickle file (without the ".pkl" extension).
    :param filepath: Directory to write into.
    :param overwrite: If False, do nothing when the file already exists.
    :return: True on success, False on failure.
    """
    # logger.debug("Method: save_pickle(data, filename, filepath, overwrite=False)")
    logger.info("Writing to pickle file: [{}]".format(
        join(filepath, filename + ".pkl")))
    if not overwrite and exists(join(filepath, filename + ".pkl")):
        logger.warning(
            "File [{}] already exists and Overwrite == False.".format(
                join(filepath, filename + ".pkl")))
        return True
    try:
        if isfile(join(filepath, filename + ".pkl")):
            logger.info("Overwriting on pickle file: [{}]".format(
                join(filepath, filename + ".pkl")))
        with sopen(join(filepath, filename + ".pkl"), 'wb') as pkl_file:
            pk.dump(data, pkl_file)
        pkl_file.close()
        return True
    except Exception as e:
        logger.warning("Could not write to pickle file: [{}]".format(
            join(filepath, filename + ".pkl")))
        logger.warning("Failure reason: [{}]".format(e))
        return False
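Together with load_pickle above, this gives a simple round trip. A minimal usage sketch; the data and path are placeholders, and it assumes the module's aliases (sopen for smart_open's open, pk for pickle, join/exists/isfile from os.path) plus a configured logger:

model_state = {"epoch": 12, "loss": 0.42}  # Placeholder data for illustration.
if save_pickle(model_state, "model_state", filepath="/tmp"):
    restored = load_pickle("model_state", filepath="/tmp")
    assert restored == model_state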
Code example #4
File: File_Util.py Project: SamujjwalSam/XC_GCN
def write_file(data,
               filename,
               filepath='',
               overwrite=False,
               mode='w',
               encoding="utf-8",
               date_time_tag='',
               verbose=False):
    """

    :param verbose:
    :param encoding:
    :param data:
    :param filename:
    :param filepath:
    :param overwrite:
    :param mode:
    :param date_time_tag:
    :return:
    """
    txt_loc = join(filepath, date_time_tag + filename + ".txt")
    if not overwrite and exists(txt_loc):
        # logger.warning("File [{}] already exists and overwrite == False.".format(txt_loc))
        return True
    with sopen(txt_loc, mode, encoding=encoding) as text_file:
        if verbose:
            logger.info("Saving text file: [{}]".format(txt_loc))
        text_file.write(str(data))
        text_file.write("\n\n")
    return True
Code example #5
def loadPostgres():

    # Load the CSV from S3 into Postgres.

    csv_files = get_s3_keys(bucket, prefix, 'csv')
    csv_path = "s3://%s/%s" % (bucket, csv_files[0])
    print(csv_path)

    conn_string = "host=%s dbname=%s user=%s password=%s" % (
        postgres_server, dbname, dbuser, password)
    conn = psycopg2.connect(conn_string)
    cur = conn.cursor()

    cur.execute("""CREATE TABLE IF NOT EXISTS top1000(
				title text,
				budget float8,
				year integer,
				revenue float8,
				popularity float8,
				ratio float8,
				companiesList text,
				url text,
				abstract text
	)
	""")
    cur.execute("""DELETE FROM top1000""")
    conn.commit()

    # Stream the CSV from S3 directly into the table.
    with sopen(csv_path, 'r') as f:
        cur.copy_from(f, 'top1000', sep='\t')

    conn.commit()
    cur.close()
    conn.close()
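A note on the design: cur.copy_from accepts any file-like object, so handing it the smart_open handle streams the CSV from S3 straight into Postgres without staging a local copy, and the DELETE FROM top1000 beforehand keeps the load re-runnable.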
Code example #6
def uploadFile(req):
    # Stream the uploaded file straight to a GCS object via smart_open.
    with sopen("gs://garage-labs/chinese-whisperer/latest.webm", "wb") as fp:
        content = req.files.get('file').read()
        fp.write(content)

    response = flask.jsonify({'some': 'data'})
    response.headers.add('Access-Control-Allow-Origin', '*')  # Allow cross-origin callers.
    return response
Code example #7
def transfer(file, mpx_id):
    url = "http://{0}{1}".format(ns_host, file)
    bucket = destination(file)
    try:
        # Stream the source object to the destination bucket in 2 KB chunks.
        with sopen(url, 'rb', 1024*500, transport_params=dict(headers=auth(file))) as fin:
            with sopen(bucket, 'wb', transport_params=dict(session=session)) as fout:
                while True:
                    buffer = fin.read(1024*2)
                    if not buffer:
                        break
                    fout.write(buffer)

        cleanup(file, mpx_id)
    except Exception as e:
        manage_threads(file, mpx_id)
        logger(e, 'threads', 'error')
        logger("NOT TRANSFERRED - {0}".format(file), 'info', 'error')
Code example #8
File: search.py Project: papower1/pyTextUtil
def contains(inputFile, outputFile, search, regex):
	with open(inputFile, 'r') as input, sopen(outputFile, 'w') as output:
		text = input.readlines()
		for line in text:
			if not line:
				break
			if regex and re.search(search, line) is not None:
				output.write(line)
			elif not regex and search in line:
				output.write(line)
Code example #9
File: delete.py Project: papower1/pyTextUtil
def duplicatedLines(inputFile, outputFile, withoutBlankLines):
	with open(inputFile, 'r') as input, sopen(outputFile, 'w') as output:
		text = input.readlines()
		linesSeen = set()
		for line in text:
			if line not in linesSeen:
				output.write(line)
				if withoutBlankLines and line.strip()=='':
					pass
				else:
					linesSeen.add(line)
Code example #10
File: File_Util.py Project: SamujjwalSam/XC_GCN
def load_json(filename: str,
              filepath: str = '',
              date_time_tag: str = '',
              ext: str = ".json",
              show_path: bool = False) -> OrderedDict:
    """
    Loads a JSON file as a Python OrderedDict.

    :param show_path: If True, log the file path being read.
    :param ext: File extension to append to the filename.
    :param filename: Name of the JSON file (without extension).
    :param filepath: Directory containing the file.
    :param date_time_tag: Optional tag prepended to the filename.
    :return: OrderedDict, or False if the file does not exist.
    """
    file_loc = join(filepath, date_time_tag + filename + ext)
    if show_path:
        logger.info("Reading JSON file: [{}]".format(file_loc))
    if exists(file_loc):
        try:
            with sopen(file_loc, encoding="utf-8") as file:
                json_dict = OrderedDict(json.load(file))
            return json_dict
        except Exception as e:
            logger.warning(
                "Could not open file as JSON: [{}]. \n Reason:[{}]".format(
                    file_loc, e))
            # Fallback: read the raw text and parse it manually.
            with sopen(file_loc, encoding="utf-8") as file:
                json_dict = json.loads(file.read())
            return json_dict
    else:
        logger.warning("File does not exist at: [{}]".format(file_loc))
        return False
    else:
        logger.warning("File does not exist at: [{}]".format(file_loc))
        return False
Code example #11
File: _s3fs.py Project: emedgene/s3fs
    def _sopen(self, key, *args, **kwargs):
        creds = f"{self.aws_access_key_id}:{self.aws_secret_access_key}"
        server_port = ""
        if self.endpoint_url:
            parsed_url = urlparse(self.endpoint_url)
            if parsed_url.netloc:
                server_port = parsed_url.netloc
            else:
                server = parsed_url.path.split("/")[0]
                server_port = f"{server}:80"
        bucket_path = f"{self._bucket_name}/{key}"
        string = "s3://" + "@".join(
            i for i in [creds, server_port, bucket_path] if i)
        return sopen(string, *args, **kwargs)
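This wrapper composes smart_open's extended S3 URI form, s3://key:secret@host:port@bucket/path, so the credentials and a custom endpoint travel inside the URI itself. A sketch of the equivalent direct call; the credentials and endpoint below are placeholders, not working values:

from smart_open import open as sopen

# Placeholder credentials and endpoint for illustration only.
uri = "s3://MYKEY:MYSECRET@minio.local:9000@my-bucket/data/file.bin"
with sopen(uri, "rb") as f:
    header = f.read(16)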
Code example #12
File: search.py Project: papower1/pyTextUtil
def contains(inputFile, outputFile, search, regex):
	total = 0
	searched = 0
	with open(inputFile, 'r') as input, sopen(outputFile, 'w') as output:
		text = input.readlines()
		for line in text:
			total += 1
			if not line:
				break
			if regex and re.search(search, line) is not None:
				output.write(line)
				searched += 1
			elif not regex and search in line:
				output.write(line)
				searched += 1

	log.info('Total : %d sentences.' % total)
	log.info('Searched : %d sentences. (%d%%)' % (searched, 100 * (float(searched) / float(total))))
Code example #13
File: File_Util.py Project: SamujjwalSam/XC_GCN
def save_json(data,
              filename,
              filepath='',
              overwrite=False,
              indent=2,
              date_time_tag=''):
    """

    :param data:
    :param filename:
    :param filepath:
    :param overwrite:
    :param indent:
    :param date_time_tag:
    :return:
    """
    logger.info("Saving JSON file: [{}]".format(
        join(filepath, date_time_tag + filename + ".json")))
    if not overwrite and exists(
            join(filepath, date_time_tag + filename + ".json")):
        logger.warning(
            "File [{}] already exists and Overwrite == False.".format(
                join(filepath, date_time_tag + filename + ".json")))
        return True
    try:
        with sopen(join(filepath, date_time_tag + filename + ".json"),
                   'w') as json_file:
            try:
                json_file.write(json.dumps(data, indent=indent))
            except Exception as e:
                logger.warning("Writing JSON failed: [{}]".format(e))
                logger.warning("Writing as string: [{}]".format(
                    join(filepath, date_time_tag + filename + ".json")))
                json_file.write(json.dumps(str(data), indent=indent))
                return True
        json_file.close()
        return True
    except Exception as e:
        logger.warning("Writing JSON file [{}] failed: [{}]".format(
            join(filepath, filename), e))
        logger.warning("Writing as TXT: [{}]".format(filename + ".txt"))
        write_file(data, filename, date_time_tag=date_time_tag)
        return False
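As with the pickle helpers, save_json and load_json form a round trip. A minimal sketch with placeholder data, assuming the same module-level aliases and logger:

record = {"id": "B000123", "title": "Sample product"}  # Placeholder data.
if save_json(record, "sample", filepath="/tmp"):
    loaded = load_json("sample", filepath="/tmp")
    print(loaded["title"])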
Code example #14
File: trim.py Project: papower1/pyTextUtil
def trim(inputFile, outputFile, lines):
	total = 0
	notTrimmed = 0
	with open(inputFile, 'r') as input, sopen(outputFile, 'w') as output:
		text = input.readlines()
		for line in text:
			total += 1
			if not line:
				break
			if lines:  # Trim blank lines only.
				if not re.search('^$', line):
					output.write(line)
					notTrimmed += 1
			else:  # Trim surrounding whitespace from every line.
				stripline = line.strip()
				output.write(stripline + '\n')
				if stripline == line[:-1]:
					notTrimmed += 1
	log.info('Total : %d sentences.' % total)
	log.info('Trimmed : %d sentences. (%d%%)' % (total - notTrimmed, 100 * (float(total - notTrimmed) / float(total))))
Code example #15
def read_green_taxi_csv(url, fobj):
    """
    Read a "green taxi" CSV file from the New York City Taxi and
    Limousine Commission (TLC) trip dataset, clean the data and write
    the data into the provided binary file object.

    Note: tested against only one specific file:

      https://nyc-tlc.s3.us-east-1.amazonaws.com/trip%20data/green_tripdata_2013-09.csv
    """

    # smart-open makes it easy to open a file via HTTP(S), S3, GCS,
    # local etc URLs.
    with sopen(url, mode='rb') as fobj_src:
        # The first line should be the header. Validate that it's what
        # we expect.
        line = fobj_src.readline()
        if line.rstrip().decode(ENCODING).split(',') != HEADER:
            raise InvalidHeaderError(line)
        # Ignore any whitespace-only lines between the header and data.
        # Return if we encounter the end of the file.
        while True:
            offset = fobj_src.tell()
            line = fobj_src.readline()
            if line.rstrip():
                fobj_src.seek(offset)
                break
            if not line:
                return
        # Ensure that there are at least 20 fields and preserve only
        # these fields via regexp. The data has an odd structure in that
        # there are additional trailing empty fields, which we ignore.
        for line in fobj_src:
            match = PATTERN_DATA.match(line)
            if not match:
                raise InvalidDataError(line)
            fobj.write(match.group(1))
            fobj.write(b'\n')
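A minimal usage sketch; the output filename is a placeholder, and ENCODING, HEADER, PATTERN_DATA and the exception classes are assumed to be defined at module level as the function implies:

url = ('https://nyc-tlc.s3.us-east-1.amazonaws.com/'
       'trip%20data/green_tripdata_2013-09.csv')
with open('green_tripdata_2013-09.cleaned.csv', 'wb') as fobj:
    read_green_taxi_csv(url, fobj)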
Code example #16
File: txt_loader.py Project: SamujjwalSam/XC_GCN
    def read_classes(self,
                     classes_dir=None,
                     classes_file="cats.txt",
                     encoding=config["text_process"]["encoding"]):
        """
        Reads the cats.txt file and returns an OrderedDict of id : class ids.

        :param classes_file: Name of the categories file.
        :param classes_dir: Directory containing the file; defaults to raw_txt_dir.
        :param encoding: Text encoding of the file.
        :return: OrderedDict mapping sample id to a list of category names.
        """
        logger.info(
            "Reads the cats.txt file and returns a OrderedDict of id : class ids."
        )
        cat_line_phrase = "  "  # Phrase to recognize lines with category information.
        cat_sep_phrase = ", "  # Phrase to separate cats.
        classes = OrderedDict()
        cat_pool = set()
        if classes_dir is None: classes_dir = self.raw_txt_dir
        with sopen(join(classes_dir, classes_file),
                   encoding=encoding) as raw_cat_ptr:
            sample_idx = raw_cat_ptr.readline().strip()
            for cnt, line in enumerate(raw_cat_ptr):
                if cat_line_phrase in line:
                    cats = line.split(cat_sep_phrase)  # Splitting the line on ', ' to get cats.
                    cats = [x.strip() for x in cats]  # Removing extra characters like ' ' and '\n'.
                    cat_pool.update(cats)
                else:
                    classes[sample_idx] = list(cat_pool)
                    cat_pool.clear()
                    sample_idx = line.strip()
            classes[sample_idx] = list(cat_pool)  # Store the categories collected for the final sample.

        return classes
Code example #17
File: txt_loader.py Project: SamujjwalSam/XC_GCN
    def read_desc(self,
                  classes_keys=None,
                  desc_path=None,
                  desc_file="descriptions.txt",
                  encoding=config["text_process"]["encoding"]):
        """
        Reads the descriptions.txt file and returns an OrderedDict of id : desc.

        :param classes_keys: List of sample2cats keys; only descriptions for these ids are stored.
        :param desc_file: Name of the descriptions file.
        :param desc_path: Full path to the descriptions file; defaults to raw_txt_dir/desc_file.
        :param encoding: Text encoding of the file.
        :return: OrderedDict mapping sample id to description.
        """
        id_phrase = "product/productId: "  # Phrase to recognize lines with sample id.
        id_remove = 19  # Length of [id_phrase], to be removed from line.
        desc_phrase = "product/description: "  # Phrase to recognize lines with sample description.
        desc_remove = 21  # Length of [desc_phrase], to be removed from line.
        logger.info(
            "Reads the descriptions.txt file and returns a OrderedDict of id : desc."
        )
        descriptions = OrderedDict()
        if desc_path is None: desc_path = join(self.raw_txt_dir, desc_file)
        import itertools  # Local import; used to read the file two lines at a time.
        with sopen(desc_path, encoding=encoding) as raw_desc_ptr:
            for idx_line, desc_line in itertools.zip_longest(*[raw_desc_ptr] * 2):  # Reads two lines per iteration.
                if id_phrase in idx_line:
                    sample_id = idx_line[id_remove:].strip()
                    if classes_keys is None or sample_id in classes_keys:  # Add this sample if corresponding class exists.
                        if desc_phrase in desc_line:
                            sample_desc = desc_line[desc_remove:].strip()
                        else:
                            sample_desc = None  # Even if 'description' is not found, we are not ignoring the sample as it might still have text in 'title'.
                        descriptions[sample_id] = sample_desc
        return descriptions
Code example #18
File: __init__.py Project: trisongz/hfsync
    def read(self, url_or_path):
        return sopen(url_or_path,
                     mode='rb',
                     transport_params=self.auth_client())
Code example #19
File: json_loader.py Project: SamujjwalSam/XC_GCN
    def gen_dicts(self,
                  json_path=None,
                  encoding=config["text_process"]["encoding"],
                  specials="""_-@*#'"/\\""",
                  replace=' '):
        """
        Generates the data dictionaries from original json file.

        :param replace: Character to replace with.
        :param specials: Characters to clean from txts.
        :param json_path: Path to raw json file.
        :param encoding: Encoding for the raw json file.
        :return: txts, classes, cats.
            Ids for which no categories were found are saved to disk as "<dataset_name>_no_cat_ids" rather than returned.
        """
        import ast  # The data is not valid JSON (single quotes instead of double quotes), so the "json" library will not work.
        from unidecode import unidecode

        logger.info("Generates the data dictionaries from original json file.")
        txts = OrderedDict()
        classes = OrderedDict()
        cats = OrderedDict()
        no_cat_ids = []  # To store ids for which no cats were found.

        if json_path is None: json_path = self.raw_json_dir
        with sopen(json_path, encoding=encoding) as raw_json_ptr:
            trans_table = File_Util.make_trans_table(
                specials=specials,
                replace=replace)  # Creating mapping to clean txts.
            cat_idx = 0  # Holds the category index.
            for cnt, line in enumerate(raw_json_ptr):
                # Instead of: line_dict = OrderedDict(json.loads(line));
                # Use: import ast; line_dict = ast.literal_eval(line.strip().replace('\n','\\n'));
                line_dict = ast.literal_eval(line.strip().replace('\n', '\\n'))
                if "categories" in line_dict:  # Check if "cats" exists.
                    if "title" in line_dict:  # Check if "title" exists, add if True.
                        txts[line_dict["asin"]] = unidecode(
                            str(line_dict["title"])).translate(trans_table)
                        if "description" in line_dict:  # Check if "description" exists and append to "title" with keyword: ". \nDESC: ", if true.
                            txts[line_dict["asin"]] = txts[
                                line_dict["asin"]] + ". \nDESC: " + unidecode(
                                    str(line_dict["description"])).translate(
                                        trans_table)
                    else:
                        if "description" in line_dict:  # Check if "description" exists even though "title" does not, use only "description" if true.
                            txts[line_dict["asin"]] = ". \nDESC: " + line_dict[
                                "description"]
                        else:  # Report and skip the sample if neither "title" nor "description" exists.
                            logger.warning(
                                "Neither 'title' nor 'description' found for sample id: [{}]. Adding sample to 'no_cat_ids'."
                                .format(line_dict["asin"]))
                            no_cat_ids.append(
                                line_dict["asin"]
                            )  # As neither "title" nor "description" exists, adding the id to "no_cat_ids".
                            continue
                    classes[line_dict["asin"]] = line_dict["cats"][0]
                    for lbl in classes[line_dict["asin"]]:
                        if lbl not in cats:  # If lbl does not exists in cats already, add it and assign a new category index.
                            cats[lbl] = cat_idx
                            cat_idx += 1
                        classes[line_dict["asin"]][classes[
                            line_dict["asin"]].index(lbl)] = cats[
                                lbl]  # Replacing cats text to cats id.
                else:  # if "categories" does not exist, then add the id to "no_cat_ids".
                    no_cat_ids.append(line_dict["asin"])

        File_Util.save_json(no_cat_ids,
                            self.dataset_name + "_no_cat_ids",
                            filepath=self.dataset_dir)
        logger.info(
            "Number of txts: [{}], sample2cats: [{}] and cattext2catid_map: [{}]."
            .format(len(txts), len(classes), len(cats)))
        return txts, classes, cats
Code example #20
File: __init__.py Project: trisongz/hfsync
    def write(self, url_or_path):
        return sopen(url_or_path,
                     mode='wb',
                     transport_params=self.auth_client())
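The read and write wrappers pair naturally for copies between object stores. A minimal sketch, assuming auth_client() returns a smart_open transport_params value and using a hypothetical Sync class with placeholder URLs; for large objects, stream in chunks as in code example #7 instead of reading everything into memory:

client = Sync()  # Hypothetical instance of the class defining read()/write().
with client.read("s3://source-bucket/weights.bin") as fin:
    data = fin.read()
with client.write("gs://dest-bucket/weights.bin") as fout:
    fout.write(data)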