def transcribe(input_file_url):
    """
    Transcribes an audio file.

    The audio file is fetched from the given URL.

    Params:
        input_file_url (string): The input audio file URL
    """
    # Set up the environment
    if not utilities.check_env_vars():
        return
    utilities.create_ancillary_folders()

    # Download the podcast file
    filepath = get_podcast_file(input_file_url)

    # Convert the file to raw audio chunks
    chunks = convert_to_raw_audio_chunks(filepath)

    # Transcribe the chunks
    transcriber = Transcriber(os.environ['GOOGLE_API_KEY'])
    transcript = transcriber.transcribe_many(chunks)

    # Write to the output file
    output_file_name = os.path.split(filepath)[-1]
    utilities.write_output_file(output_file_name, transcript)

    print("Cleaning up...\n")
    utilities.cleanup()
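# A minimal usage sketch for transcribe(), assuming GOOGLE_API_KEY is set in
# the environment; the URL below is a hypothetical placeholder, not a real feed.
transcribe('https://example.com/podcasts/episode-42.mp3')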
def add_to_db(Session, k):
    session = Session()
    t = dt.date.today()
    adds = 0
    cleantracks = k
    for line in cleantracks:
        clean_name = cleanup(line[0])
        n_ = band(name=line[0], song=line[1], album=line[2],
                  release_year=line[3], source=line[4], dateplayed=line[5],
                  dateadded=t, cleanname=clean_name)
        q = session.query(band).filter(band.name == n_.name,
                                       band.song == n_.song,
                                       band.source == n_.source)
        if q.first() is None:
            session.add(n_)
            adds += 1
        else:
            try:
                print('Already had {0} - {1}'.format(n_.name, n_.song))
            except Exception:
                print('Already had it. Cannot print. ID is {0}'.format(q.first().id))
    session.commit()
    return adds
def load_to_db(albumlist):
    socket.setdefaulttimeout(15)
    # Create the SQL database and the "session" object that is used to manage
    # communications with the database.
    engine = create_engine('sqlite:///../../databases/scout.db')
    session_factory = sessionmaker(bind=engine)
    Session = scoped_session(session_factory)
    # The declarative base's metadata is reachable through the mapped class.
    band.metadata.create_all(engine)
    session = Session()
    t = dt.date.today()
    adds = 0
    for i in albumlist:
        print(i)
        clean_name = cleanup(i[0])
        n_ = band(name=i[0], album=i[1], source='KEXP Countdown 2018',
                  appeared='KEXP Countdown 2018', dateadded=t,
                  cleanname=clean_name)
        # Dedupe on name + album: this loader stores albums, not songs.
        q = session.query(band).filter(band.name == n_.name,
                                       band.album == n_.album)
        if q.first() is None:
            session.add(n_)
            adds += 1
        else:
            try:
                print('Already had {0} - {1}'.format(n_.name, n_.album))
            except Exception:
                print('Already had it. Cannot print. ID is {0}'.format(q.first().id))
    session.commit()
    print('Added {0} albums'.format(adds))
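# Both loaders above assume a SQLAlchemy-mapped `band` class that is defined
# elsewhere. The following is a minimal sketch consistent with the columns the
# loaders touch; the table name and column types are assumptions, not the
# project's actual definition.
from sqlalchemy import Column, Date, Integer, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class band(Base):
    __tablename__ = 'band'          # assumed table name

    id = Column(Integer, primary_key=True)
    name = Column(String)
    song = Column(String)
    album = Column(String)
    release_year = Column(String)   # type assumed; could be Integer
    source = Column(String)
    appeared = Column(String)
    dateplayed = Column(Date)       # type assumed
    dateadded = Column(Date)
    cleanname = Column(String)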
def load_other_bands(Session, choices):
    # This loop pulls down band names from the sources identified.
    # Each source, however, needs its own function, since the
    # websites are set up differently.
    proceed = False
    for src in choices:
        if src in bandsources:
            # At least one of the choices needs to be in bandsources,
            # or else this function doesn't need to proceed.
            proceed = True
    if not proceed:
        return

    session = Session()
    today = dt.date.today()
    other_sources = [
        'KEXP Music That Matters', 'Pitchfork Top Tracks', 'Stereogum',
        'Metacritic', 'KCRW', 'Pitchfork', 'KEXP charts', 'KNKX'
    ]
    other_choices = [x for x in choices if x in other_sources]

    for src in other_choices:
        print('\n\n\n', src)
        try:
            results = grabbands(src)  # renamed from `list`, which shadows the builtin
        except Exception as e:
            print('{0} not provided for in load_other_bands.grabbands'.format(src))
            print(str(e))
            continue  # nothing was fetched for this source, so skip to the next
        add_count = 0
        for i in results:
            h = cleanup(i.name)
            if h != '':
                try:
                    q = session.query(band).filter(band.cleanname == h)
                    if q.first() is None:
                        i.source = src
                        i.cleanname = h
                        i.dateadded = today
                        print("Adding {0} (from {1})".format(i.name, i.source))
                        session.add(i)
                        session.commit()
                        add_count += 1
                except Exception as e:
                    print(str(e))
        print('Added {0} entries to the {1} collection. \n\n'.format(add_count, src))
    return
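# A hedged usage sketch: `Session` is a scoped session factory (as built in
# load_to_db), and at least one of the source names must appear in the
# module-level `bandsources` collection for the call to proceed.
load_other_bands(Session, ['Stereogum', 'KEXP charts'])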
        msg['content']['url'])
    for category in msg['categories']:
        if category not in pre_payload:
            pre_payload[category] = []
        pre_payload[category].append(msg)

payload = [dblock()]
for category in pre_payload:
    payload.append(cblock(category))
    pre_payload[category].sort(key=lambda x: x['type'])
    types = set([t['type'] for t in pre_payload[category]])
    for t in types:
        payload.append(tblock(t))
        payload.append(iblock(
            [msg for msg in pre_payload[category] if msg['type'] == t],
            category, t))
    payload.append(dblock())

ec.parse()
slacky.send_message(blocks=payload, channel_id=channel_id)
slacky.delete_set_messages(messages, channel_id=channel_id)
ut.cleanup()
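# The dblock/cblock/tblock/iblock helpers used above are defined elsewhere.
# If they emit Slack Block Kit blocks, minimal sketches might look like the
# following; these structures are assumptions, not the project's actual
# helpers (iblock, which renders the grouped messages, is omitted here).
def dblock():
    # Divider between categories.
    return {'type': 'divider'}

def cblock(category):
    # Header block naming the category.
    return {'type': 'header',
            'text': {'type': 'plain_text', 'text': category}}

def tblock(t):
    # Section block naming a message type within the category.
    return {'type': 'section',
            'text': {'type': 'mrkdwn', 'text': '*{0}*'.format(t)}}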
def similar(a, b):
    # SequenceMatcher comes from difflib; ratio() is in [0.0, 1.0].
    a = cleanup(a)
    b = cleanup(b)
    return SequenceMatcher(None, a, b).ratio()
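# cleanup() is used throughout as a name normalizer but is not shown in this
# file. A plausible sketch (the exact rules are an assumption), followed by a
# usage example for similar():
import re
from difflib import SequenceMatcher

def cleanup(s):
    # Assumed normalization: lowercase, drop punctuation, collapse whitespace.
    s = re.sub(r'[^\w\s]', '', s.lower())
    return ' '.join(s.split())

print(similar('The Beatles', 'the beatles!!'))  # 1.0 once both sides normalize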
def main(argv=sys.argv[1:]):
    parser = OptionParser(usage=USAGE, version="Version: " + __version__)

    # Required arguments
    requiredOptions = OptionGroup(
        parser, "Required options",
        "These options are required to run BinGeR, and may be supplied in any order.")
    requiredOptions.add_option(
        "-l", "--sample_list", type="string", metavar="FILE",
        help="Text file containing all sample names, one per line")
    requiredOptions.add_option(
        "-o", "--out_dir", type="string", metavar="OUTDIR",
        help="Working directory where the results and intermediate files will be stored")
    parser.add_option_group(requiredOptions)

    # Optional arguments that need to be supplied if not the same as default
    optOptions = OptionGroup(
        parser, "Optional parameters",
        "These options are optional, and may be supplied in any order.")
    optOptions.add_option(
        "-b", "--bams_dir", type="string", default="Bams", metavar="DIR",
        help="Directory where sorted bam files (reads versus assembly, same sample) "
             "are; naming should follow the \"sample.*.bam\" convention. [Default: ./Bams]")
    optOptions.add_option(
        "-c", "--coverage_dir", type="string", default="Coverage", metavar="DIR",
        help="Directory where coverage files are; naming follows the "
             "\"sampleA.vs.sampleB.*.coverage\" convention. [Default: ./Coverage]")
    optOptions.add_option(
        "-a", "--assemblies_dir", type="string", default="Assemblies", metavar="DIR",
        help="Directory where assemblies in fasta format are; naming follows the "
             "\"sample.*.fa\" convention. [Default: ./Assemblies]")
    optOptions.add_option(
        "-z", "--zscore_dir", type="string", default="ZScores", metavar="DIR",
        help="Directory where oligo-nt z-score files are; naming follows the "
             "\"sample.*.ZScore\" convention. [Default: ./ZScores]")
    optOptions.add_option(
        "-s", "--hmmscan_dir", type="string", default="HMMScan", metavar="DIR",
        help="Directory where hmmscan files are; naming follows the "
             "\"sample.*.hmmscan\" convention. [Default: ./HMMScan]")
    optOptions.add_option(
        "-t", "--num_proc", type="int", default=1, metavar='INT',
        help="Number of processors for BinGeR to use [default: 1].")
    optOptions.add_option(
        "--blat", type="string", default="blat",
        help="Path to blat; specify if not in env.")
    parser.add_option_group(optOptions)

    # Binning parameters that could fine-tune the process
    clusteringOptions = OptionGroup(
        parser, "Binning parameters",
        "These options are optional, and may be supplied in any order.")
    clusteringOptions.add_option(
        "-m", "--min_core", type="int", default=1e5, metavar='INT',
        help="Minimum size to consider as bin core [default: 1e5].")
    clusteringOptions.add_option(
        "-u", "--cov_clustering_min_length", dest="minCovLength",
        type="int", default=1500, metavar='INT',
        help="Minimum contig length to be considered in coverage clustering "
             "[default: 1500].")
    clusteringOptions.add_option(
        "--min_cov_corrcoef", dest="minCovCorrceof",
        type="float", default=0.95, metavar='FLOAT',
        help="Minimum correlation coefficient cutoff to form a link between "
             "contigs using coverage profiles [default: 0.95].")
    clusteringOptions.add_option(
        "--min_zscore_corrcoef", dest="minZScoreCorrceof",
        type="float", default=0.95, metavar='FLOAT',
        help="Minimum correlation coefficient cutoff to form a link between "
             "contigs using tri-/tetra-nt frequency Z-score [default: 0.95].")
    clusteringOptions.add_option(
        "-x", "--zscore_clustering_min_length", dest="minZLength",
        type="int", default=3000, metavar='INT',
        help="Minimum contig length to be considered in Z-score clustering "
             "[default: 3000].")
    clusteringOptions.add_option(
        "-d", "--cpr_alpha", type="float", default=0.9, metavar='FLOAT',
        help="The dampening factor, alpha, in community personalized PageRank "
             "[default: 0.9, range: (0.75, 0.95)].")
    clusteringOptions.add_option(
        "-e", "--cpr_tol", type="float", default=1e-4, metavar='FLOAT',
        help="The error tolerance factor, tol, in community personalized PageRank "
             "[default: 1e-4, range: (1e-8, 1e-2)].")
    clusteringOptions.add_option(
        "-i", "--cpr_maxiter", type="int", default=50, metavar='INT',
        help="The max iterations performed in community personalized PageRank "
             "[default: 50, range: (20, 100)].")
    parser.add_option_group(clusteringOptions)

    # Runtime settings that could affect file saving and message printing
    runtimeSettings = OptionGroup(
        parser, "Runtime settings",
        "These options are optional, and may be supplied in any order.")
    runtimeSettings.add_option(
        "-q", "--quiet", default=False, action="store_true",
        help="Suppress printing detailed runtime information; only important "
             "messages will show [default: False].")
    runtimeSettings.add_option(
        "--no_intermediates", action="store_false", dest="save_intermediates",
        default=True,
        help="Do not save intermediate files during runtime "
             "[default: True (save intermediates)].")
    parser.add_option_group(runtimeSettings)

    (options, args) = parser.parse_args(argv)

    # parser.error() prints the message and exits with a nonzero status.
    if options.sample_list is None:
        parser.error("A list of samples and a working directory are required!")
    if options.out_dir is None:
        parser.error("An output directory is required!")
    if options.num_proc < 1:
        parser.error("Number of processors must be an integer >= 1, you supplied %i"
                     % options.num_proc)
    if options.min_core < 1e4 or options.min_core > 1e6:
        parser.error("Minimum bin size must be in range [1e4, 1e6] bp, you supplied %i"
                     % options.min_core)
    if options.cpr_alpha < 0.75 or options.cpr_alpha > 0.95:
        parser.error("Community PageRank alpha must be a float in range [0.75, 0.95], "
                     "you supplied %.3f" % options.cpr_alpha)
    if options.cpr_tol < 1e-8 or options.cpr_tol > 1e-2:
        parser.error("Community PageRank tol must be a float in range [1e-8, 1e-2], "
                     "you supplied %g" % options.cpr_tol)
    if options.cpr_maxiter < 20 or options.cpr_maxiter > 100:
        parser.error("Community PageRank maxiter must be an integer in range [20, 100], "
                     "you supplied %i" % options.cpr_maxiter)

    total_start_time = time()
    sys.stdout.write("BinGeR started at %s\n" % (ctime()))
    sys.stdout.flush()

    # Test whether blat exists.
    blatTest = Popen(options.blat, shell=True, stdout=PIPE).stdout.read()
    if blatTest is None or len(blatTest) == 0:
        sys.stderr.write("FATAL: blat not found in path!\n")
        exit(1)

    # Check the sanity of the files in the required directories.
    projInfo = ProjectInfo()
    projInfo.initProject(options)
    if not options.quiet:
        projInfo.printSamples()

    # Build the networkx graph for the project.
    sys.stdout.write('\nInitializing contig space...\n')
    G = cSpace.ContigSpace(projInfo.samples)

    # Resume from saved cores if present; otherwise run the full pipeline.
    initCoresPath = projInfo.out_dir + '/initCores'
    if os.path.exists(initCoresPath) and len(glob.glob(initCoresPath + '/*.cpickle')) > 0:
        G.refineCores(projInfo, options)
        G.recruitContigs(projInfo, options)
    else:
        G.initSubgraphs(projInfo, options)
        G.forgeCores(projInfo, options)
        G.refineCores(projInfo, options)
        G.recruitContigs(projInfo, options)

    # Output the bins and their evaluation; extract reads of bins for
    # downstream analysis.
    utilities.outputBins(projInfo, options)

    # Get all the reads for the bins.
    utilities.extractReadsForBins(projInfo, options)

    # Clean up if necessary.
    if not options.save_intermediates:
        utilities.cleanup(projInfo)

    total_finish_time = time()
    sys.stdout.write("BinGeR finished at %s\n" % (ctime()))
    sys.stdout.flush()
    return
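# A hedged invocation sketch that drives the option parser directly from
# Python; the file names and paths are placeholders, not files shipped with
# BinGeR. The default directory layout (Bams/, Coverage/, Assemblies/,
# ZScores/, HMMScan/) is assumed to sit under the current working directory.
main(['-l', 'sample_list.txt', '-o', 'out', '-t', '8',
      '--blat', '/usr/local/bin/blat'])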