import inspect
import os

import numpy

from wordfish.utils import wordfish_home

# format_single_input, format_inputs, and add_lines are helper functions assumed
# to be defined elsewhere in this module.


def generate_job(func, category, inputs=None, batch_num=1):
    '''generate_job

    Write job commands for a plugin function into that plugin's extraction script.

    Parameters
    ==========
    func: str
        name of the function to call in the plugin's functions.py
    category: str
        must be one of "terms", "corpus", or "relations", corresponding to the output folder
    inputs: dict
        keys are argument names; values are either single strings or lists of string
        arguments passed to func. If inputs is not specified, the function is called
        once with no inputs.
    batch_num: int
        the number of input items to package into a single job. For example,
        batch_num=100 calls func with 100 of the input items at a time; each item
        is still written to its own output file.
    '''
    # Get name of calling plugin from the caller's directory
    home = wordfish_home()
    cf = inspect.currentframe()
    caller = inspect.getouterframes(cf, 2)
    tag = os.path.dirname(caller[1][1]).split("/")[-1]
    script = "wordfish.plugins.%s.functions" % (tag)
    output_dir = ' output_dir="%s/%s/%s"' % (home, category, tag)

    # Script the job commands are appended to
    extraction_script = "%s/scripts/run_extractions_%s.job" % (home, tag)
    lines_to_add = []

    if category in ["corpus", "terms", "relations"]:

        if inputs is None:
            lines_to_add.append("python -c 'from %s import %s; %s(%s)'" % (script, func, func, output_dir))

        else:
            formatted_inputs = ""

            # First collect all string args - these are the same for all jobs
            for varname, elements in inputs.iteritems():
                if isinstance(elements, str):
                    single_input = format_single_input(varname, elements)
                    formatted_inputs = "%s%s" % (formatted_inputs, single_input)

            # Now collect list args - all lists must be of equal length
            input_lists = dict()
            for varname, elements in inputs.iteritems():
                if isinstance(elements, list):
                    if len(input_lists) > 0:
                        if len(input_lists.values()[0]) == len(elements):
                            input_lists[varname] = elements
                    else:
                        input_lists[varname] = elements

            # If we have no input lists, just write the job with single args
            if len(input_lists) == 0:
                formatted_inputs = formatted_inputs.strip(",")
                lines_to_add.append("python -c 'from %s import %s; %s(%s,%s)'"
                                    % (script, func, func, output_dir, formatted_inputs))

            # Otherwise write one job line per batch of batch_num items
            else:
                N = len(input_lists.values()[0])
                iters = int(numpy.ceil(N / float(batch_num)))
                start = 0
                for i in range(1, iters + 1):
                    formatted_instance = formatted_inputs
                    if i == iters:
                        end = N
                    else:
                        end = i * batch_num
                    for varname, elements in input_lists.iteritems():
                        new_input = format_inputs(varname, elements[start:end])
                        formatted_instance = "%s%s" % (formatted_instance, new_input)
                    start = end
                    formatted_instance = formatted_instance.strip(",")
                    lines_to_add.append("python -c 'from %s import %s; %s(%s,%s)'"
                                        % (script, func, func, output_dir, formatted_instance))

    # Add lines to the extraction script
    add_lines(script=extraction_script, lines_to_add=lines_to_add)
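
# Usage sketch -- hypothetical, not taken from any plugin shown here. In a real
# plugin this call would sit inside the plugin's go_fish() function, since
# generate_job infers the plugin name from the directory of the calling file.
# "extract_text" and the "pmids" argument name are illustrative assumptions.
def _example_go_fish():
    pmids = ["1000001", "1000002", "1000003"]
    generate_job(func="extract_text",      # function defined in the plugin's functions.py
                 category="corpus",        # output written under <wordfish_home>/corpus/<plugin>
                 inputs={"pmids": pmids},  # each input item still gets its own output file
                 batch_num=100)            # up to 100 items handled per generated job line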
import urllib2
import pandas
import pickle
import numpy
import re
import os
import sys

# IMPORTS FOR ALL PLUGINS
from wordfish.corpus import save_sentences
from wordfish.terms import save_terms
from wordfish.terms import save_relations
from wordfish.plugin import generate_job
from wordfish.utils import wordfish_home

home = wordfish_home()


# REQUIRED WORDFISH FUNCTION
def go_fish():
    f, d = download_data()
    features = pandas.read_csv(f, sep="\t")
    database = pandas.read_csv(d, sep="\t")
    pmids = database.id.unique().tolist()
    print "NeuroSynth database has %s unique PMIDs" % (len(pmids))

    # Generate brain maps to extract relationships with
    terms = features.columns.tolist()
    terms.pop(0)  # pmid
    maps_dir = "%s/terms/neurosynth/maps" % (home)
#!/usr/bin/python

# IMPORTS #########################################################################

import os
import sys

from wordfish.vm import init_scripts, make_plugin_folders
from wordfish.utils import make_directory, wordfish_home
from wordfish.terms import download_nltk

# DIRECTORIES #####################################################################

analysis_dir = wordfish_home()
corpus_output = make_directory("%s/corpus" % (analysis_dir))
terms_output = make_directory("%s/terms" % (analysis_dir))
relations_output = make_directory("%s/relations" % (analysis_dir))
scripts_directory = make_directory("%s/scripts" % (analysis_dir))

# INIT FUNCTIONS ##################################################################
# These are safe to re-run; if already done they will not cause harm

download_nltk()
make_plugin_folders(analysis_dir)
init_scripts(scripts_directory, analysis_dir)

print("\n\n\n################################ WORDFISH ################################\n\n"
      "Application at %s\n\n"
      "Scripts are generated in the scripts folder. First open run_slurm.py to check "
      "that the parameters are ok. Then run the script to generate jobs for each plugin:\n\n"
      "    python run_slurm.py run_first.job\n\n"
      "This generates a list of commands for each plugin that you have selected; submit "
      "the list to your cluster (or run it with launch) to complete all the extractions, e.g.:\n\n"
      "    python run_slurm.py run_extractions_reddit.job\n\n"
      "Launch is recommended as another method, see: https://github.com/vsoch/poldracklab-launch\n\n"
      "Once corpus, terms, and relations are extracted, you can run run_analysis.py "
      "[NOT YET DEVELOPED]." % analysis_dir)
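
# --------------------------------------------------------------------------------
# Not part of wordfish: a minimal local-execution sketch, assuming each generated
# run_extractions_<plugin>.job file simply lists one shell command per line (which
# is how generate_job appends them). On a cluster you would submit these lines as
# jobs (or use launch) instead of looping over them serially; the "reddit" plugin
# name is just the example used in the message above.
import subprocess

def run_extractions_locally(plugin="reddit"):
    job_file = "%s/scripts/run_extractions_%s.job" % (wordfish_home(), plugin)
    with open(job_file) as handle:
        for command in handle.read().splitlines():
            if command.strip():
                subprocess.call(command, shell=True)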