Code Example #1
def main():
    """ """
    # Check if output directories exist
    commons.check_path(args)

    # Print start messages
    commons.show_summary(args)
    commons.show_networking(args)  # globals: proxies, torsocks

    # Read config.yaml
    commons.read_config(args)  # globals: config

    # Recompile exclusions
    commons.recompile_exclusions()  # globals: exclusions

    # Create queues
    url_queue = commons.create_queue("url_queue")

    # Create threads
    commons.UrlQueueManager(args, url_queue)

    # Read file containing URLs
    urls = commons.read_file(args.input_file)

    # Process URLs
    for url in urls:
        if not url.startswith("http"):
            url = "http://{}".format(url)
        url_queue.put(url)

    url_queue.join()
    return
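The `main()` above relies on module-level state: `args` plus the globals noted in the comments, which the `commons` helpers populate. A minimal sketch of the wiring it appears to assume (only `input_file` is taken from the code above; the real parser likely defines more options):

import argparse

import commons  # project-local helper module used throughout these examples

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process a file of URLs")
    parser.add_argument("input_file", help="file with one URL per line")
    args = parser.parse_args()  # module-level name that main() reads
    main()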
Code Example #2
import numpy as np
import pandas as pd

import commons  # project-local helper module


def generate_matrix(test_file, tags_file, output_file=None):
    tracks, tags, extra = commons.read_file(test_file)

    tags_data = pd.read_csv(tags_file, delimiter='\t', header=None)
    # Map each category/tag string in the tags file to its column index.
    tag_map = {}
    for index, tag_str in enumerate(tags_data[0]):
        category, tag = tag_str.split(commons.TAG_HYPHEN)
        if category not in tag_map:
            tag_map[category] = {}
        tag_map[category][tag] = index

    # One boolean row per track, one column per tag.
    data = np.zeros([len(tracks), len(tags_data[0])], dtype=bool)
    for i, track in enumerate(tracks.values()):
        for category in commons.CATEGORIES:
            for tag in track[category]:
                data[i][tag_map[category][tag]] = True

    if output_file is not None:
        np.save(output_file, data)

    return data
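A hedged usage sketch for `generate_matrix`; the file names are illustrative:

matrix = generate_matrix("autotagging-test.tsv", "tags.tsv", output_file="groundtruth.npy")
print(matrix.shape)  # (number of tracks, number of tags)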
Code Example #3
def main():
    """ """
    # Check if output directories exist
    commons.check_path(args)

    # Print start messages
    commons.show_summary(args)
    commons.show_networking(args, uagent)

    # Read suspicious.yaml and external.yaml
    suspicious = commons.read_externals()

    # Recompile exclusions
    commons.recompile_exclusions()

    # Build dict of extensions
    extensions = {}
    extensions.update(suspicious["archives"])
    extensions.update(suspicious["files"])

    # Read file containing URLs
    urls = commons.read_file(args.input_file)

    # Create queues
    recursion_queue = commons.create_queue("recursion_queue")

    # Create threads
    commons.RecursiveQueueManager(args, recursion_queue, uagent, extensions)

    # Process URLs
    for url in urls:
        if not (url.startswith("http://") or url.startswith("https://")):
            continue

        recursion_queue.put(url)

    recursion_queue.join()
    return
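As in Example #1, this `main()` reads module-level names; `uagent` here is presumably a user-agent string defined next to `args` at start-up, for example:

uagent = "Mozilla/5.0 (compatible; example-crawler)"  # illustrative value only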
Code Example #4

import argparse

import commons  # project-local helper module


def filter_subset(tracks, tags_subset):
    tracks_to_delete = []
    # Intersect each track's tags with the allowed subset and remember
    # tracks that end up with no tags at all.
    for track_id, track in tracks.items():
        total_tags = 0
        for category, tags_new in tags_subset.items():
            track[category] &= tags_new
            total_tags += len(track[category])
        if total_tags == 0:
            tracks_to_delete.append(track_id)

    # Drop tracks that no longer have any tags.
    for track_id in tracks_to_delete:
        tracks.pop(track_id)

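`read_tags_file` is called below (and again in Example #5) but not shown in this excerpt. A hypothetical sketch, assuming one category/tag pair per line joined by `commons.TAG_HYPHEN` (as in Example #2) and a `{category: set_of_tags}` return value so the set intersection in `filter_subset` works:

def read_tags_file(tags_file):
    # Hypothetical helper: build {category: set(tags)} from a plain-text file.
    tags_subset = {}
    with open(tags_file) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            category, tag = line.split(commons.TAG_HYPHEN)
            tags_subset.setdefault(category, set()).add(tag)
    return tags_subset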

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Filters out tags according to a tag list and removes tracks with '
                                                 'no tags left')
    parser.add_argument('tsv_file', help='TSV file with the columns TRACK_ID, ARTIST_ID, ALBUM_ID, PATH, DURATION, '
                                         'TAGS')
    parser.add_argument('tags_file', help='file with the tag subset to keep')
    parser.add_argument('output_file', help='output TSV file')

    args = parser.parse_args()

    tracks, tags, extra = commons.read_file(args.tsv_file)
    tags_subset = read_tags_file(args.tags_file)
    filter_subset(tracks, tags_subset)
    commons.write_file(tracks, args.output_file, extra)
Code Example #5
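This example starts mid-script; `read_tags_file`, `filter_subset` (Example #4) and `filter_category` are assumed to be defined earlier in the same file. A hedged sketch of the setup the fragment appears to expect, with option names inferred from the attribute accesses below and an illustrative value for `PARTS`:

import argparse
from pathlib import Path

import commons  # project-local helper module

PARTS = ['train', 'validation', 'test']  # assumed split names

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Post-processes the per-split TSV files')
    parser.add_argument('directory', help='directory with one sub-directory per split')
    parser.add_argument('input_prefix', help='prefix of the input TSV files')
    parser.add_argument('output_prefix', help='prefix of the output TSV files')
    parser.add_argument('--subset-file', help='optional file with the tag subset to keep')
    parser.add_argument('--category', help='optional category to keep')
    parser.add_argument('--sort', action='store_true', help='sort tracks by track id')
    args = parser.parse_args()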
    tags_subset = None
    if args.subset_file is not None:
        tags_subset = read_tags_file(args.subset_file)

    split_dirs = [
        split_dir for split_dir in Path(args.directory).iterdir()
        if split_dir.is_dir()
    ]
    # Process every part of every split directory.
    for split_dir in split_dirs:
        for part in PARTS:
            input_file = split_dir / (args.input_prefix + '-' + part + '.tsv')
            output_file = split_dir / (args.output_prefix + '-' + part +
                                       '.tsv')

            tracks, tags, extra = commons.read_file(input_file)

            if tags_subset is not None:
                filter_subset(tracks, tags_subset)

            if args.category is not None:
                tracks = filter_category(tracks, tags, args.category)

            if args.sort:
                tracks = {
                    track_id: tracks[track_id]
                    for track_id in sorted(tracks)
                }

            commons.write_file(tracks, output_file, extra)
Code Example #6
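This example opens mid-function. A hedged reconstruction of the missing head of `predict_popular`, assuming it marks the single most frequent training-set tag for every test track (the counting logic is illustrative, not taken from the original source):

import argparse
import collections

import numpy as np
import pandas as pd

import commons  # project-local helper module


def predict_popular(train_tracks, train_tags, test_tracks, test_tags, tags_order):
    # Count how often each category/tag pair occurs in the training set.
    # (train_tags/test_tags are unused by this naive baseline.)
    counts = collections.Counter()
    for track in train_tracks.values():
        for category in commons.CATEGORIES:
            for tag in track[category]:
                counts[category + commons.TAG_HYPHEN + tag] += 1

    # Predict the most popular tag for every test track.
    most_popular = counts.most_common(1)[0][0]
    tag_index = list(tags_order[0]).index(most_popular)
    data = np.zeros([len(test_tracks), len(tags_order[0])], dtype=bool)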
    data[:, tag_index] = True
    return data


ALGORITHMS = {'popular': predict_popular}

DEFAULT_ALGORITHM = 'popular'

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Generates predictions based on naive baseline algorithms')
    parser.add_argument('train_file', help=commons.METADATA_DESCRIPTION)
    parser.add_argument('test_file', help=commons.METADATA_DESCRIPTION)
    parser.add_argument('tags_file',
                        help='file with the tag order that will be used')
    parser.add_argument('output_file', help='output NPY file')
    parser.add_argument('--algorithm',
                        choices=ALGORITHMS.keys(),
                        default=DEFAULT_ALGORITHM,
                        help='algorithm to use')
    args = parser.parse_args()

    func = ALGORITHMS[args.algorithm]
    train_tracks, train_tags, _ = commons.read_file(args.train_file)
    test_tracks, test_tags, _ = commons.read_file(args.test_file)
    tags_order = pd.read_csv(args.tags_file, delimiter='\t', header=None)

    data = func(train_tracks, train_tags, test_tracks, test_tags, tags_order)
    np.save(args.output_file, data)