def main():
    """Entry point: feed the URLs from the input file into the worker queue.

    Relies on the module-level ``args`` (parsed CLI options) and the
    ``commons`` helpers; blocks until every queued URL has been processed.
    """
    # Validate output directories and announce the run configuration.
    commons.check_path(args)
    commons.show_summary(args)
    commons.show_networking(args)  # globals: proxies, torsocks

    # Load config.yaml and rebuild the exclusion patterns.
    commons.read_config(args)  # globals: config
    commons.recompile_exclusions()  # globals: exclusions

    # One queue plus the worker threads that drain it.
    url_queue = commons.create_queue("url_queue")
    commons.UrlQueueManager(args, url_queue)

    # Enqueue every URL, defaulting to plain HTTP when no scheme is present.
    for url in commons.read_file(args.input_file):
        if not url.startswith("http"):
            url = "http://{}".format(url)
        url_queue.put(url)

    url_queue.join()
    return
def generate_matrix(test_file, tags_file, output_file=None):
    """Build a boolean track-by-tag matrix for the tracks in *test_file*.

    Column order follows *tags_file* (TSV, one ``category<sep>tag`` string
    per row, joined by ``commons.TAG_HYPHEN``).  The matrix is optionally
    saved as an NPY file and always returned.
    """
    tracks, _, _ = commons.read_file(test_file)
    tag_column = pd.read_csv(tags_file, delimiter='\t', header=None)[0]

    # Map category -> tag -> column index, in the order given by tags_file.
    column_of = {}
    for col, joined in enumerate(tag_column):
        category, tag = joined.split(commons.TAG_HYPHEN)
        column_of.setdefault(category, {})[tag] = col

    # One row per track; a cell is True when the track carries that tag.
    matrix = np.zeros([len(tracks), len(tag_column)], dtype=bool)
    for row, track in enumerate(tracks.values()):
        for category in commons.CATEGORIES:
            for tag in track[category]:
                matrix[row][column_of[category][tag]] = True

    # Persist only when a destination was requested.
    if output_file is not None:
        np.save(output_file, matrix)
    return matrix
def main():
    """Entry point: recursively crawl the URLs listed in the input file.

    Relies on the module-level ``args`` and ``uagent``; blocks until the
    recursion queue has been fully drained.
    """
    # Validate output directories and announce the run configuration.
    commons.check_path(args)
    commons.show_summary(args)
    commons.show_networking(args, uagent)

    # Load suspicious.yaml / external.yaml and rebuild exclusion patterns.
    suspicious = commons.read_externals()
    commons.recompile_exclusions()

    # Extensions of interest: archives plus plain files.
    extensions = {**suspicious["archives"], **suspicious["files"]}

    urls = commons.read_file(args.input_file)

    # One queue plus the worker threads that crawl it.
    recursion_queue = commons.create_queue("recursion_queue")
    commons.RecursiveQueueManager(args, recursion_queue, uagent, extensions)

    # Only well-formed http(s) URLs are queued; everything else is skipped.
    for url in urls:
        if url.startswith(("http://", "https://")):
            recursion_queue.put(url)

    recursion_queue.join()
    return
def filter_subset(tracks, tags_subset):
    """Intersect each track's tag sets with *tags_subset*, in place.

    For every category present in *tags_subset* the track's tag set is
    reduced to the allowed tags; tracks left with zero tags across those
    categories are removed from *tracks*.
    """
    empty_ids = []
    for track_id, track in tracks.items():
        remaining = 0
        for category, allowed in tags_subset.items():
            track[category] &= allowed  # in-place set intersection
            remaining += len(track[category])
        if remaining == 0:
            empty_ids.append(track_id)
    # Deletion is deferred so the dict is not mutated while iterating it.
    for track_id in empty_ids:
        tracks.pop(track_id)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Filters out tags according to tag list and removes tracks with no '
                    'tags left')
    parser.add_argument('tsv_file',
                        help='TSV file with such columns: TRACK_ID, ARTIST_ID, ALBUM_ID, PATH, DURATION, '
                             'TAGS')
    parser.add_argument('tags_file', help='file with list of tag subset')
    parser.add_argument('output_file', help='output tsv file')
    args = parser.parse_args()

    tracks, tags, extra = commons.read_file(args.tsv_file)
    tags_subset = read_tags_file(args.tags_file)
    filter_subset(tracks, tags_subset)
    commons.write_file(tracks, args.output_file, extra)
# Optional tag whitelist: only applied when a subset file was supplied.
tags_subset = read_tags_file(args.subset_file) if args.subset_file is not None else None

# Each immediate subdirectory of the target directory is one dataset split;
# within a split, every PART gets read, filtered, and written back out.
for split_dir in [d for d in Path(args.directory).iterdir() if d.is_dir()]:
    for part in PARTS:
        src = split_dir / (args.input_prefix + '-' + part + '.tsv')
        dst = split_dir / (args.output_prefix + '-' + part + '.tsv')

        tracks, tags, extra = commons.read_file(src)
        if tags_subset is not None:
            filter_subset(tracks, tags_subset)
        if args.category is not None:
            tracks = filter_category(tracks, tags, args.category)
        if args.sort:
            # Rebuild the mapping in ascending track-id order.
            tracks = {tid: tracks[tid] for tid in sorted(tracks)}
        commons.write_file(tracks, dst, extra)
        # NOTE(review): the enclosing function (apparently ``predict_popular``,
        # given the ALGORITHMS registry below) begins outside this chunk; the
        # indentation of these two lines is reconstructed — confirm against
        # the original file.
        data[:, tag_index] = True  # mark this tag's column True for every row
    return data


# Registry of baseline prediction algorithms, keyed by CLI name.
ALGORITHMS = {'popular': predict_popular}
DEFAULT_ALGORITHM = 'popular'

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Generates predictions based on naive baseline algorithms')
    parser.add_argument('train_file', help=commons.METADATA_DESCRIPTION)
    parser.add_argument('test_file', help=commons.METADATA_DESCRIPTION)
    parser.add_argument('tags_file',
                        help='file with tag order that will be used')
    parser.add_argument('output_file', help='output NPY file ')
    parser.add_argument('--algorithm', choices=ALGORITHMS.keys(),
                        default=DEFAULT_ALGORITHM, help='algorithm to use')
    args = parser.parse_args()

    # NOTE(review): ``func`` is assigned but never used — the dict is indexed
    # again below; both expressions resolve to the same callable.
    func = ALGORITHMS[args.algorithm]

    train_tracks, train_tags, _ = commons.read_file(args.train_file)
    test_tracks, test_tags, _ = commons.read_file(args.test_file)
    # The tag-order file fixes the column layout of the prediction matrix.
    tags_order = pd.read_csv(args.tags_file, delimiter='\t', header=None)

    data = ALGORITHMS[args.algorithm](train_tracks, train_tags, test_tracks,
                                      test_tags, tags_order)
    np.save(args.output_file, data)