Beispiel #1
0
def generate_job(func,category,inputs=None,batch_num=1):
    '''
    generate_job
    Build "python -c" command lines that import and call a plugin function,
    and pass them to add_lines for the calling plugin's extraction job script.

    Parameters
    ==========
    func: str
        name of function to call in plugin functions.py
    category: str
        must be one of "terms" or "corpus" or "relations" corresponding to output folder.
        For any other value no lines are added.
    inputs: dict
        key should be arg name, and value should be either a single string
        (formatted once and repeated in every generated command) or a list of
        string args as input to func (split across commands in batches).
        Lists must be of equal length: a list whose length differs from the
        first list collected is skipped.
        If inputs are not specified, it is assumed that the function will be called once
        with no inputs.
    batch_num: int
        the number of jobs to package into one job. For example, batch_num=100 will run
        func with 100 of the input items specified. Each is still written to its own
        output file.
    '''
    # Get name of calling plugin: the directory that holds the caller's source file
    home = wordfish_home()
    cf = inspect.currentframe()
    caller = inspect.getouterframes(cf, 2)
    tag = os.path.dirname(caller[1][1]).split("/")[-1]
    script = "wordfish.plugins.%s.functions" %(tag)
    output_dir = ' output_dir="%s/%s/%s"' %(home,category,tag)

    # script name to add jobs to
    extraction_script = "%s/scripts/run_extractions_%s.job" %(home,tag)

    # Unknown categories add nothing
    if category not in ["corpus","terms","relations"]:
        return

    lines_to_add = []
    if inputs is None:
        lines_to_add.append("python -c 'from %s import %s; %s(%s)'" %(script,func,func,output_dir))
    else:
        # First collect all string args - this means same for all scripts
        # (items() rather than iteritems() so this runs on Python 2 and 3)
        formatted_inputs = ""
        for varname,elements in inputs.items():
            if isinstance(elements,str):
                single_input = format_single_input(varname,elements)
                formatted_inputs = "%s%s" %(formatted_inputs,single_input)

        # Now collect lists, must be equal length. N is the length of the
        # first list seen; lists of any other length are left out.
        input_lists = dict()
        N = None
        for varname,elements in inputs.items():
            if not isinstance(elements,list):
                continue
            if N is None:
                N = len(elements)
            if len(elements) == N:
                input_lists[varname] = elements

        # If we have no input lists, just write the job with single args
        if not input_lists:
            formatted_inputs = formatted_inputs.strip(",")
            lines_to_add.append("python -c 'from %s import %s; %s(%s,%s)'" %(script,func,func,output_dir,formatted_inputs))
        else:
            iters = int(numpy.ceil(N/float(batch_num)))
            start = 0
            for i in range(1,iters+1):
                # The last batch may be shorter than batch_num
                end = min(i*batch_num,N)
                formatted_instance = formatted_inputs
                for varname,elements in input_lists.items():
                    new_input = format_inputs(varname,elements[start:end])
                    formatted_instance = "%s%s" %(formatted_instance,new_input)
                start = end
                # str.strip returns a new string; keep it so batched commands
                # get the same comma clean-up as the single-args branch above
                formatted_instance = formatted_instance.strip(",")
                lines_to_add.append("python -c 'from %s import %s; %s(%s,%s)'" %(script,func,func,output_dir,formatted_instance))

    # Add lines
    add_lines(script=extraction_script,lines_to_add=lines_to_add)
Beispiel #2
0
def generate_job(func, category, inputs=None, batch_num=1):
    '''
    generate_job
    Build "python -c" command lines that import and call a plugin function,
    and pass them to add_lines for the calling plugin's extraction job script.

    Parameters
    ==========
    func: str
        name of function to call in plugin functions.py
    category: str
        must be one of "terms" or "corpus" or "relations" corresponding to output folder.
        For any other value no lines are added.
    inputs: dict
        key should be arg name, and value should be either a single string
        (formatted once and repeated in every generated command) or a list of
        string args as input to func (split across commands in batches).
        Lists must be of equal length: a list whose length differs from the
        first list collected is skipped.
        If inputs are not specified, it is assumed that the function will be called once
        with no inputs.
    batch_num: int
        the number of jobs to package into one job. For example, batch_num=100 will run
        func with 100 of the input items specified. Each is still written to its own
        output file.
    '''
    # Get name of calling plugin: the directory that holds the caller's source file
    home = wordfish_home()
    cf = inspect.currentframe()
    caller = inspect.getouterframes(cf, 2)
    tag = os.path.dirname(caller[1][1]).split("/")[-1]
    script = "wordfish.plugins.%s.functions" % (tag)
    output_dir = ' output_dir="%s/%s/%s"' % (home, category, tag)

    # script name to add jobs to
    extraction_script = "%s/scripts/run_extractions_%s.job" % (home, tag)

    # Unknown categories add nothing
    if category not in ["corpus", "terms", "relations"]:
        return

    lines_to_add = []
    if inputs is None:
        lines_to_add.append("python -c 'from %s import %s; %s(%s)'" %
                            (script, func, func, output_dir))
    else:
        # First collect all string args - this means same for all scripts
        # (items() rather than iteritems() so this runs on Python 2 and 3)
        formatted_inputs = ""
        for varname, elements in inputs.items():
            if isinstance(elements, str):
                single_input = format_single_input(varname, elements)
                formatted_inputs = "%s%s" % (formatted_inputs, single_input)

        # Now collect lists, must be equal length. N is the length of the
        # first list seen; lists of any other length are left out.
        input_lists = dict()
        N = None
        for varname, elements in inputs.items():
            if not isinstance(elements, list):
                continue
            if N is None:
                N = len(elements)
            if len(elements) == N:
                input_lists[varname] = elements

        # If we have no input lists, just write the job with single args
        if not input_lists:
            formatted_inputs = formatted_inputs.strip(",")
            lines_to_add.append(
                "python -c 'from %s import %s; %s(%s,%s)'" %
                (script, func, func, output_dir, formatted_inputs))
        else:
            iters = int(numpy.ceil(N / float(batch_num)))
            start = 0
            for i in range(1, iters + 1):
                # The last batch may be shorter than batch_num
                end = min(i * batch_num, N)
                formatted_instance = formatted_inputs
                for varname, elements in input_lists.items():
                    new_input = format_inputs(varname, elements[start:end])
                    formatted_instance = "%s%s" % (formatted_instance,
                                                   new_input)
                start = end
                # str.strip returns a new string; keep it so batched commands
                # get the same comma clean-up as the single-args branch above
                formatted_instance = formatted_instance.strip(",")
                lines_to_add.append(
                    "python -c 'from %s import %s; %s(%s,%s)'" %
                    (script, func, func, output_dir, formatted_instance))

    # Add lines
    add_lines(script=extraction_script, lines_to_add=lines_to_add)
Beispiel #3
0
def go_fish(tag,extraction_script):
    '''
    go_fish
    Build the command that imports and calls go_fish from a plugin's
    functions module, and pass it to add_lines.

    Parameters
    ==========
    tag: str
        plugin name, substituted into wordfish.plugins.<tag>.functions
    extraction_script: str
        handed to add_lines as the script to add the command to
    '''
    command_template = "python -c 'from wordfish.plugins.%s.functions import go_fish; go_fish()'"
    commands = [command_template % tag]
    add_lines(script=extraction_script,lines_to_add=commands)
Beispiel #4
0
def go_fish(tag, extraction_script):
    '''
    go_fish
    Build the command that imports and calls go_fish from a plugin's
    functions module, and pass it to add_lines.

    Parameters
    ==========
    tag: str
        plugin name, substituted into wordfish.plugins.<tag>.functions
    extraction_script: str
        handed to add_lines as the script to add the command to
    '''
    command_template = "python -c 'from wordfish.plugins.%s.functions import go_fish; go_fish()'"
    commands = [command_template % tag]
    add_lines(script=extraction_script, lines_to_add=commands)