Example #1
def main(dataset_path, percent_eval):
    OUTPUT_PATH = "/opt/ml/model"
    EXTRACT_PATH = "/home"
    TMP_PATH = "/home/tmp"

    if not os.path.exists(TMP_PATH):
        os.makedirs(TMP_PATH)
    if not os.path.exists(EXTRACT_PATH):
        os.makedirs(EXTRACT_PATH)

    try:
        shutil.copy(dataset_path, join(EXTRACT_PATH, 'data.tar'))
    except OSError:
        print('unable to retrieve the dataset tar file.')
        sys.exit(1)
    with tarfile.open(join(EXTRACT_PATH, 'data.tar')) as tar_file:
        tar_file.extractall(join(EXTRACT_PATH, 'out'))

    if percent_eval > 100 or percent_eval < 0:
        percent_eval = 30
    json_to_csv.main(percent_eval)

    generate_tfrecord.main(TMP_PATH + "/train.csv", join(OUTPUT_PATH, 'train.record'))
    generate_tfrecord.main(TMP_PATH + "/eval.csv", join(OUTPUT_PATH, 'eval.record'))

    parse_meta.main(join(OUTPUT_PATH, 'map.pbtxt'))

    print(".\nRecords generated")
Example #2
def query():

    patent_values = []  # patent config-file parameters
    item_list = []  # list of IDs to query
    results_found = 0

    # Query configuration file
    parser = configparser.ConfigParser()
    parser.read("./query_config.cfg")

    # Read the configuration file
    patent_values = get_config_value(parser, 0)

    item_list = list(
        set(
            open(os.path.join(
                patent_values[3],
                patent_values[2])).read().rstrip('\n').split('\n')))

    for item in item_list:
        results_found = query_one_patent(item, patent_values, results_found)

    if results_found == 0:
        print("Query returned no results")
    else:
        # Generate a CSV from the JSON results
        json_to_csv.main(patent_values[3], parser.sections()[0], results_found)

        # Remove duplicate rows from the CSV and save
        output_filename = os.path.join(patent_values[3],
                                       parser.sections()[0] + '.csv')
        df = pd.read_csv(output_filename, dtype=object, encoding='Latin-1')
        df = df[patent_values[5]].drop_duplicates().sort_values(
            by=patent_values[6][0],
            ascending=[
                direction != 'desc' for direction in patent_values[6][1]
            ])
        df.to_csv(output_filename, index=False)
        print('({} rows returned)'.format(len(df)))
Example #3
def main(dataset_paths, percent_eval, directory):

    OUTPUT_PATH = directory
    EXTRACT_PATH = "/home"
    TMP_PATH = "/home/tmp"

    if not os.path.exists(TMP_PATH):
        os.makedirs(TMP_PATH)
    if not os.path.exists(EXTRACT_PATH):
        os.makedirs(EXTRACT_PATH)

    try:
        for i in dataset_paths:
            shutil.copy(i, join(EXTRACT_PATH, 'data.tar'))
    except OSError as err:
        print('unable to retrieve a dataset tar file:', err)
        sys.exit(1)
    for dataset in dataset_paths:
        with tarfile.open(dataset) as tar_file:
            tar_file.extractall(join(EXTRACT_PATH, 'out'))

    if percent_eval > 100 or percent_eval < 0:
        percent_eval = 30
    json_to_csv.main(percent_eval)
    try:

        generate_tfrecord.main(TMP_PATH + "/train.csv",
                               join(OUTPUT_PATH, 'train.record'))
        generate_tfrecord.main(TMP_PATH + "/eval.csv",
                               join(OUTPUT_PATH, 'eval.record'))

        parse_meta.main(join(OUTPUT_PATH, 'map.pbtxt'))

        print(".\nRecords generated")
    except ValueError:
        print(
            "The datasets provided do not have the same class labels. Please make sure that labels are spelt the same in both datasets, or label the same objects for both datasets."
        )
Example #4
def query(configfile):
    # Query the PatentsView database using parameters specified in configfile
    parser = configparser.ConfigParser()
    parser.read(configfile)

    # Loop through the separate queries listed in the config file.
    for q in parser.sections():

        print("Running query: ", q)

        # Parse parameters from config file
        entity = json.loads(parser.get(q, 'entity'))
        url = 'http://www.patentsview.org/api/' + entity + '/query?'

        input_file = json.loads(parser.get(q, 'input_file'))
        directory = json.loads(parser.get(q, 'directory'))
        input_type = json.loads(parser.get(q, 'input_type'))
        fields = json.loads(parser.get(q, 'fields'))

        try:
            # If specified, 'sort' should be a list of dictionaries, specifying
            # the order of keys and direction of each key.

            sort = json.loads(parser.get(q, 'sort'))
            sort_fields, sort_directions = [], []
            for dct in sort:
                for field in dct:
                    # We can only sort by fields that are in the data
                    if field in fields:
                        sort_fields.append(field)
                        sort_directions.append(dct[field])
            if len(sort_fields) == 0:
                sort_fields = [fields[0]]
                sort_directions = ["asc"]
        except (configparser.NoOptionError, ValueError):
            # 'sort' missing or malformed: fall back to the first field, ascending
            sort_fields = [fields[0]]
            sort_directions = ["asc"]

        criteria = {
            "_and": [
                json.loads(parser.get(q, option))
                for option in parser.options(q)
                if option.startswith('criteria')
            ]
        }

        item_list = list(
            set(open(os.path.join(directory, input_file)).read().split('\n')))
        results_found = 0

        item_list_len = len(item_list)

        for item in item_list:
            params = {
                'q': {
                    "_and": [{
                        input_type: item
                    }, criteria]
                },
                'f': fields
            }

            r = requests.post(url, data=json.dumps(params))

            if 400 <= r.status_code <= 499:
                print("Client error when quering for value {}".format(item))
            elif r.status_code >= 500:
                print(
                    "Server error when querying for value {}. You may be exceeding the maximum API request size (1GB)."
                    .format(item))
            elif json.loads(r.text)['count'] != 0:
                outp = open(os.path.join(directory, q + '_' + \
                            str(results_found) + '.json'), 'w')
                print(r.text, end='', file=outp)
                outp.close()
                results_found += 1

        if results_found == 0:
            print("Query {} returned no results".format(q))
        else:
            # Output merged CSV of formatted results.
            json_to_csv.main(directory, q, results_found)

            # Clean csv: reorder columns, drop duplicates, sort, then save
            output_filename = os.path.join(directory, q + '.csv')
            df = pd.read_csv(output_filename, dtype=object, encoding='Latin-1')
            df = df[fields].drop_duplicates().sort_values(
                by=sort_fields,
                ascending=[
                    direction != 'desc' for direction in sort_directions
                ])
            df.to_csv(output_filename, index=False)
            print('({} rows returned)'.format(len(df)))
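
For reference, a hypothetical query_config.cfg sketch matching the options this function parses (every value must be valid JSON, as implied by the json.loads calls); the section name, fields, and criteria below are illustrative assumptions, not taken from the original source:

; illustrative config; one section per query
[example_query]
entity = "patents"
input_file = "patent_ids.txt"
directory = "./results"
input_type = "patent_number"
fields = ["patent_number", "patent_title", "patent_date"]
sort = [{"patent_date": "desc"}]
criteria_1 = {"_gte": {"patent_date": "2010-01-01"}}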
Example #5
def main(dataset_paths, percent_eval, directory):
    ROOT_PATH, PATH_EXT = os.path.splitext(dataset_paths)
    DATASET_NAME = ROOT_PATH.split('/')[-1]

    OUTPUT_PATH = directory
    EXTRACT_PATH = "/home"
    TMP_PATH = "/home/tmp"

    # Placeholder for enum, here 1 is tar, 0 is ZIP
    NORMAL_MODE = 1  # Assume this is a tar

    if not os.path.exists(TMP_PATH):
        os.makedirs(TMP_PATH)
    if not os.path.exists(EXTRACT_PATH):
        os.makedirs(EXTRACT_PATH)

    if PATH_EXT == '.zip':
        print(
            '.zip file extension found, interpreting as tensorflow object detection csv zip'
        )
        NORMAL_MODE = 0  # Not a tar file

    if NORMAL_MODE:  # Handle the tar case
        print("normal mode")

        try:
            shutil.copy(dataset_paths, join(EXTRACT_PATH, 'data.tar'))
        except OSError as err:
            print('unable to retrieve a dataset tar file:', err)
            sys.exit(1)

        with tarfile.open(dataset_paths) as tar_file:
            tar_file.extractall(join(EXTRACT_PATH, 'out'))

        if percent_eval > 100 or percent_eval < 0:
            percent_eval = 30
        json_to_csv.main(percent_eval)
        try:

            generate_tfrecord.main(TMP_PATH + "/train.csv",
                                   join(OUTPUT_PATH, 'train.record'),
                                   NORMAL_MODE, "/home/")
            generate_tfrecord.main(TMP_PATH + "/eval.csv",
                                   join(OUTPUT_PATH, 'eval.record'),
                                   NORMAL_MODE, "/home/")

            parse_meta.main(join(OUTPUT_PATH, 'map.pbtxt'), NORMAL_MODE,
                            TMP_PATH + "/eval.csv")

            print(".\nRecords generated")
        except ValueError:
            print(
                "The datasets provided do not have the same class labels. Please make sure that labels are spelt the same in both datasets, or label the same objects for both datasets."
            )

    if not NORMAL_MODE:
        print('treating as zip of tf obj detect')
        # Unzip the archive into the correct directory
        with zipfile.ZipFile(
                dataset_paths, 'r'
        ) as zip_file:  # Unzip the file (Assuming 1 zip at this time)
            # If the archive's members sit at the top level (train/valid/test),
            # extract into a folder named after the dataset; otherwise extract as-is
            last_member = zip_file.namelist()[-1]
            if any(
                [last_member.startswith(i) for i in ["valid", "train", "test"]]):
                zip_file.extractall(EXTRACT_PATH + "/" + DATASET_NAME)
            else:
                zip_file.extractall(EXTRACT_PATH)
            from fnmatch import fnmatch

            pattern = "*.csv"

            for path, subdirs, files in os.walk(EXTRACT_PATH):
                for name in files:
                    if fnmatch(name, pattern):
                        print("CSV:", os.path.join(path, name))

        # Generate the records
        print(EXTRACT_PATH + "/" + DATASET_NAME + "/test/_annotations.csv")
        generate_tfrecord.main(
            EXTRACT_PATH + "/" + DATASET_NAME + "/test/_annotations.csv",
            join(OUTPUT_PATH, 'eval.record'), NORMAL_MODE,
            EXTRACT_PATH + "/" + DATASET_NAME + "/test/")
        generate_tfrecord.main(
            EXTRACT_PATH + "/" + DATASET_NAME + "/train/_annotations.csv",
            join(OUTPUT_PATH, 'train.record'), NORMAL_MODE,
            EXTRACT_PATH + "/" + DATASET_NAME + "/train/")

        print('main records generated')
        parse_meta.main(
            join(OUTPUT_PATH,
                 'map.pbtxt'), NORMAL_MODE, EXTRACT_PATH + "/" + DATASET_NAME +
            "/train/_annotations.csv")  # Edge case of missing label in one csv

        print(".\nRecords generated")