def main():
	parser = make_arg_parser()
	args = parser.parse_args()
	# Parse command line
	tanimoto = args.tanimoto
	with open(args.mpfa, 'r') as inf:
		# Generates dictionary with each unique 'refseq_cluster' as keys, ORFs as values
		cluster_map = build_cluster_map(inf, bread=args.bread)
	with open(args.input, 'r') as inf2:
		inkey = generate_index_list(inf2)
		print('\nOk, processing input file...\n')
	with open(args.input, 'r') as in_csv2:
		headers = generate_chunk_list(in_csv2)
	c_list = list(cluster_map.keys())
	grabbed_clusters = []
	data_to_pool = []
	# print(c_list)
	for cluster in c_list:
		grab = pick_a_cluster(headers, cluster)  # uses the name of the cluster to get a list of all orfs for a particular unique cluster
		# print(grab)
		if grab:
			grabbed_clusters.append(cluster)
			with open(args.input, 'r') as inf3:
				mx = pd.read_csv(inf3, sep=',', header=0, usecols=grab, engine='c')  # loads in only the columns from the grab list, i.e. all cols for a unique cluster
			mx.index = inkey  # reindexes the df with the orf labels after importing specific columns with usecols
			data_to_pool.append(mx)
	dlen = len(data_to_pool)
	print('Built the data list of %s clusters' % dlen)
	args_list = [cluster_map, c_list]  # organizes all the arguments that the parallelized function needs into a list
	print('\nSending data to Workers... work, Workers, work!\n')
	if __name__ == '__main__':
		worker = parallel_tanimoto if tanimoto else parallel_minicluster
		results = list(futures.map(partial(worker, args_list=args_list), data_to_pool))
		outdf = pd.concat(results, axis=1)
	print('File processing complete; writing output file...\n')
	del data_to_pool
	with open(args.output, 'w') if args.output != '-' else sys.stdout as outf:
		# outdf = pd.concat(results, axis=1)
		outdf.columns = grabbed_clusters  # names the columns (and index, next line) according to clusters in the order they were processed
		outdf.index = c_list
		outdf.sort_index(axis=0, inplace=True)
		outdf.sort_index(axis=1, inplace=True)
		outdf = outdf.round(decimals=3)
		outdf.to_csv(outf)
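
# The helper functions called in these excerpts (make_arg_parser, build_cluster_map,
# generate_index_list, generate_chunk_list, pick_a_cluster, the parallel_* workers)
# are defined elsewhere in the project and are not shown here. As a rough, assumed
# sketch of what the inline comments describe (not the project's actual
# implementations): generate_index_list returns the row labels (ORF names) from the
# first column of the CSV, generate_chunk_list returns the column headers, and
# pick_a_cluster selects the headers belonging to one cluster.
def generate_index_list(in_csv):
    # Assumed: the first column of each data row holds the ORF label.
    next(in_csv)  # skip the header row
    return [line.split(',')[0] for line in in_csv]

def generate_chunk_list(in_csv):
    # Assumed: the first line of the CSV is the header row of ORF column names.
    return next(in_csv).strip().split(',')

def pick_a_cluster(headers, cluster):
    # Assumed: a column belongs to a cluster if the cluster ID appears in its name.
    return [h for h in headers if cluster in h]
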
def main():
    parser = make_arg_parser()
    args = parser.parse_args()
    # Parse command line
    cpus = args.cpus
    num_cpus = cpu_count()
    tanimoto = args.tanimoto
    if cpus > num_cpus:
        print(
            '\nError: Number of requested processors exceeds hardware available!'
        )
        print('Maximum processors available is %s.\n' % num_cpus)
        sys.exit()
    with open(args.mpfa, 'r') as inf:
        # Generates dictionary with each unique 'refseq_cluster' as keys, ORFs as values
        cluster_map = build_cluster_map(inf, bread=args.bread)
        # intype = str(args.input).split('.')[-1]
    with open(args.input, 'r') as inf2:
        inkey = generate_index_list(inf2)
    with open(args.input, 'r') as in_csv2:
        headers = generate_chunk_list(in_csv2)
    c_list = list(cluster_map.keys())
    ct = len(c_list)
    print('Found %d clusters...' % ct)
    results_list = []
    grabbed_clusters = []
    j = 0
    for cluster in c_list:
        grab = pick_a_cluster(
            headers, cluster
        )  # uses the name of the cluster to get a list of all orfs for a particular unique cluster
        # print(grab)
        if grab:
            grabbed_clusters.append(cluster)
            with open(args.input, 'r') as inf3:
                bigmat = big_cluster_completeness(inf3, grab, inkey, cluster,
                                                  cluster_map, cpus, tanimoto,
                                                  c_list, j)
            results_list.append(bigmat)  # builds a list of dataframes, one single-column df per cluster
            j += 1
    print('File processing complete; writing output file...\n')
    with open(args.output, 'w') if args.output != '-' else sys.stdout as outf:
        outdf = pd.concat(results_list, axis=1)
        outdf.columns = grabbed_clusters  # names the columns (and index, next line) according to clusters in the order they were processed
        outdf.index = c_list
        outdf.sort_index(axis=0, inplace=True)
        outdf.sort_index(axis=1, inplace=True)
        outdf = outdf.round(decimals=3)
        outdf.to_csv(outf)
def main():
    parser = make_arg_parser()
    args = parser.parse_args()
    # Parse command line
    with open(args.mpfa, 'r') as inf:
        # Generates dictionary with each unique 'refseq_cluster' as keys, ORFs as values
        cluster_map = build_cluster_map(inf, bread=args.bread)
    with open(args.input, 'r') as in_csv:
        print('\nOk, processing input file in pieces...\n')
        inkey = generate_index_list(in_csv)
        # print(len(inkey))
    with open(args.input, 'r') as in_csv2:
        headers = generate_chunk_list(in_csv2)
        # print(len(headers))
        c_list = list(cluster_map.keys())
        # ct = len(c_list)
        # print('Found %d clusters...' % ct)
        data_to_pool = []
        grabbed_clusters = []
    for cluster in c_list:
        grab = pick_a_cluster(
            headers, cluster
        )  # uses the name of the cluster to get a list of all orfs for a particular unique cluster
        if grab:
            grabbed_clusters.append(cluster)
            with open(args.input, 'r') as inf3:
                mx = pd.read_csv(
                    inf3, sep=',', header=0, usecols=grab, engine='c'
                )  # loads in only the columns from the grab list, i.e. all cols for a unique cluster
            mx.index = inkey  # reindexes the df with the orf labels after importing specific columns with usecols
            data_to_pool.append(mx)  # list of dfs to map over for multiprocessing
    if __name__ == '__main__':
        print('\nSending data to Workers... work, Workers, work!')
        results = list(
            futures.map(partial(parallel_clustermean, c_list=c_list),
                        data_to_pool))
        print('\nFile processing complete; writing output file...\n')
        del data_to_pool
    with open(args.output, 'w') if args.output != '-' else sys.stdout as outf:
        outdf = pd.concat(results, axis=1)
        outdf.columns = grabbed_clusters  # names the columns (and index, next line) according to clusters in the order they were processed
        # outdf.index = c_list
        outdf.sort_index(
            axis=0, inplace=True
        )  # ensure that the clusters are in order on cols and rows
        outdf.sort_index(axis=1, inplace=True)
        outdf.to_csv(outf)
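
# The parallel step above maps a partially-applied worker over the list of
# per-cluster DataFrames with an external futures.map (likely SCOOP). The same
# pattern, sketched with the standard library's concurrent.futures as a stand-in
# and a hypothetical, simplified clustermean worker that collapses the ORF
# columns of one cluster into a single column of row-wise means:
from concurrent.futures import ProcessPoolExecutor
from functools import partial

def parallel_clustermean(mx, c_list):
    # Hypothetical worker: average across the ORF columns of one cluster.
    # c_list is unused in this simplified sketch; it is kept only to mirror
    # the signature used above.
    return mx.mean(axis=1).to_frame()

def map_over_clusters(data_to_pool, c_list):
    # Workers receive one DataFrame each; results come back in input order.
    with ProcessPoolExecutor() as pool:
        return list(pool.map(partial(parallel_clustermean, c_list=c_list),
                             data_to_pool))
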
def main():
    parser = make_arg_parser()
    args = parser.parse_args()
    # Parse command line
    if args.synthesize:
        final_df = synthesize_chunks()
        with open(args.output, 'w') as outf:
            final_df.to_csv(outf)
            print('\nMerged data written to file... exiting...\n')
            sys.exit()
    with open(args.mpfa, 'r') as inf:
        # Generates dictionary with each unique 'refseq_cluster' as keys, ORFs as values
        cluster_map = build_cluster_map(inf, bread=args.bread)
    with open(args.input, 'r') as in_csv:
        print('\nOk, processing input file...\n')
        big_df = pd.read_csv(in_csv,
                             sep=',',
                             header=0,
                             index_col=0,
                             engine='c')
    inkey = list(big_df.index)
    # inkey = generate_index_list(in_csv)
    c_list = list(cluster_map.keys())
    ct = len(c_list)
    n = int(args.cutsize)
    print('Found %d clusters... Making groups of %d clusters...' % (ct, n))
    # Make a list of lists of clusters, to guide the breaking up of the csv
    bcl = [c_list[i:i + n] for i in range(0, len(c_list), n)]
    print('\nMaster list generated... now doing the splits!')
    p = 1
    for c in bcl:
        grab_chunk = []
        for cluster in list(c):
            grab = pick_a_cluster(
                inkey, cluster
            )  # uses the name of the cluster to get a list of all orfs for a particular unique cluster
            grab_chunk.extend(grab)
        chunk_df = big_df[grab_chunk]
        outf = args.output
        if outf.endswith('.csv'):
            outf = outf.replace('.csv', '')  # str.replace returns a new string; reassign it
        outf = '_'.join([outf, str(p), '.csv'])
        chunk_df.to_csv(outf)
        print('\nSaved matrix chunk %d...' % p)
        del chunk_df
        p += 1
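
# The slice comprehension used above is the usual idiom for splitting a list
# into fixed-size groups; with toy data:
c_list = ['c1', 'c2', 'c3', 'c4', 'c5']
n = 2
bcl = [c_list[i:i + n] for i in range(0, len(c_list), n)]
# bcl == [['c1', 'c2'], ['c3', 'c4'], ['c5']] -- the last group may be shorter
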
def main():
    parser = make_arg_parser()
    args = parser.parse_args()
    # Parse command line
    if args.synthesize:
        final_df = synthesize_chunks()
        with open(args.output, 'w') as outf:
            final_df.to_csv(outf)
            print('\nMerged data written to file... exiting...\n')
            sys.exit()
    with open(args.mpfa, 'r') as inf:
        # Generates dictionary with each unique 'refseq_cluster' as keys, ORFs as values
        cluster_map = build_cluster_map(inf, bread=args.bread)
    with open(args.input, 'r') as in_csv:
        print('\nOk, processing input file in pieces...\n')
        inkey = generate_index_list(in_csv)
    c_list = list(cluster_map.keys())
    ct = len(c_list)
    n = int(args.cuts)
    print('Found %d clusters... Making %d cuts...' % (ct, n))
    # g = orfct / cuts
    # if not g.is_integer():
    # 	g = int(g) + 1
    bcl = [c_list[i:i + n] for i in range(0, len(c_list), n)]
    print('\nMaster list generated... now doing the splits!')
    p = 1
    for c in bcl:
        grab_chunk = []
        for cluster in list(c):
            grab = pick_a_cluster(
                inkey, cluster
            )  # uses the name of the cluster to get a list of all orfs for a particular unique cluster
            grab_chunk.extend(grab)
        with open(args.input, 'r') as inf3:
            mx = pd.read_csv(
                inf3, sep=',', header=0, usecols=grab_chunk, engine='c'
            )  # loads in only the columns from the grab list, i.e. all cols for a unique cluster
        mx.index = inkey  # reindexes the df with the orf labels after importing specific columns with usecols
        # data_to_pool.append(mx)  # create the list of dfs to map over for multiprocessing
        outf = args.output
        if outf.endswith('.csv'):
            outf = outf.replace('.csv', '')  # str.replace returns a new string; reassign it
        outf = '_'.join([outf, str(p), '.csv'])
        mx.to_csv(outf)
        print('\nSaved a matrix chunk...')
        p += 1
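
# A more defensive way to build the chunk filenames (a suggestion, not the
# original code) is to strip the extension once with os.path.splitext and
# format the counter in directly:
import os

output = 'results.csv'  # stands in for args.output
base, _ = os.path.splitext(output)
p = 1
chunk_name = '%s_%d.csv' % (base, p)  # 'results_1.csv'
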
def main():
    parser = make_arg_parser()
    args = parser.parse_args()
    # Parse command line
    with open(args.mpfa, 'r') if args.mpfa != '-' else sys.stdin as inf:
        # Generates dictionary with each unique 'refseq_cluster' as keys, ORFs as values
        cluster_map = build_cluster_map(inf, bread=args.bread)
        intype = str(args.input).split('.')[-1]
        insize = os.stat(args.input).st_size
        with open(args.input, 'r') as inf2:
            with open(args.output,
                      'w') if args.output != '-' else sys.stdout as outf:
                if not args.pieces:
                    outdf = cluster_completeness(intype, cluster_map, inf2)
                    outdf = outdf.round(decimals=3)
                    outdf.to_csv(outf)
                else:
                    print('\nOk, processing input file in pieces...\n')
                    inkey = generate_index_list(inf2)
                    c_list = list(cluster_map.keys())
                    ct = len(c_list)
                    print('Found %d clusters...' % ct)
                    mat = np.zeros(
                        (ct, ct)
                    )  # initializes an array of the dimensions necessary to fit all cluster results
                    j = 0
                    for cluster in c_list:
                        grab = pick_a_cluster(inkey, cluster)
                        # print(grab)
                        with open(args.input, 'r') as inf3:
                            mat = big_cluster_completeness(
                                grab, inkey, cluster, cluster_map, c_list,
                                inf3, mat, j)
                        # print(mat)
                        j += 1
                    print('File processing complete; writing output file...\n')
                    outdf = pd.DataFrame(mat, dtype=float)
                    outdf.columns = c_list  # names the columns (and index, next line) according to clusters in the order they were processed
                    outdf.index = c_list
                    outdf.sort_index(axis=0, inplace=True)
                    outdf.sort_index(axis=1, inplace=True)
                    outdf = outdf.round(decimals=3)
                    outdf.to_csv(outf)
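
# This variant accumulates results in a preallocated numpy array instead of a
# list of DataFrames: mat starts as a ct x ct block of zeros and, as the code
# suggests, big_cluster_completeness fills one column per cluster. The fill
# pattern, with placeholder values standing in for the real completeness scores:
import numpy as np

ct = 3
mat = np.zeros((ct, ct))
for j in range(ct):
    mat[:, j] = np.full(ct, 0.5 * (j + 1))  # placeholder column of per-cluster scores
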
def main():
    parser = make_arg_parser()
    args = parser.parse_args()
    # Parse command line
    with open(args.mpfa, 'r') as inf:
        # Generates dictionary with each unique 'refseq_cluster' as keys, ORFs as values
        cluster_map = build_cluster_map(inf, bread=args.bread)
    with open(args.input, 'r') as in_csv:
        with open(args.output,
                  'w') if args.output != '-' else sys.stdout as outf:
            print('\nOk, processing input file in pieces...\n')
            inkey = generate_index_list(in_csv)
            c_list = list(cluster_map.keys())
            ct = len(c_list)
            print('Found %d clusters...' % ct)
            results_list = []
            j = 0
            for cluster in c_list:
                grab = pick_a_cluster(
                    inkey, cluster
                )  # uses the name of the cluster to get a list of all orfs for a particular unique cluster
                # print(grab)
                with open(args.input, 'r') as inf3:
                    bigmat = big_cluster_v_cluster(inf3, grab, inkey, c_list,
                                                   j)
                # print(bigmat)
                results_list.append(
                    bigmat
                )  # returns a list of dataframes, one for each cluster column
                j += 1
            print('File processing complete; writing output file...\n')
            outdf = pd.concat(results_list, axis=1)
            outdf.columns = c_list  # names the columns (and index, next line) according to clusters in the order they were processed
            outdf.index = c_list
            outdf.sort_index(axis=0, inplace=True)
            outdf.sort_index(axis=1, inplace=True)
            outdf = outdf.round(decimals=2)
            outdf.to_csv(outf)
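
# Several of these examples assemble the final cluster-by-cluster matrix by
# concatenating one single-column DataFrame per cluster along axis=1; a minimal
# illustration of that assembly step with toy columns:
import pandas as pd

cols = [pd.DataFrame({'clusterA': [1.0, 2.0]}),
        pd.DataFrame({'clusterB': [3.0, 4.0]})]
outdf = pd.concat(cols, axis=1)  # two columns sharing the same row index
outdf = outdf.round(decimals=2)
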