Example #1
def process(input_data, processor, max_refresh_delay=0.3):
    # use multiprocessing to parallelize the work across worker processes

    queue = JoinableQueue()
    results_queue = Queue()

    # populate queue, reserve place for results
    results = []
    for i, data in enumerate(input_data):
        queue.put((i, data))
        results.append(None)

    p = Progress(len(input_data), estimate=True, values=True) # output progress bar

    # define jobs
    count = Value('i', 0)
    num_threads = cpu_count()
    sync_count = len(input_data)/1000/num_threads  # batch size for shared-counter updates (Python 2 integer division)

    print 'Starting %i jobs ...' % num_threads

    jobs = [Process(target=worker, args=(processor, queue, results_queue, count, sync_count)) for i in range(num_threads)]

    try:
        # start jobs
        for job in jobs:
            job.start()

        # gathering results from jobs
        total_count = 0
        while total_count < len(input_data):
            try:
                item = results_queue.get(True, max_refresh_delay)   # timeout delay small enough to update progress bar, see below
                results[item[0]] = item[1]
                total_count += 1
            except Empty:
                pass
            p.set(count.value)  # even if no results are received (cached somewhere), the counter will be updated after get() timeout above
            # NOTE: the bar may linger at 100% briefly: the workers' finished counter runs ahead of
            # the received-results count, so it stays at 100% until all results have been drained.

        p.set(total_count)
        p.complete()

        # wait for jobs to finish
        queue.join()
        for job in jobs:
            job.join()

    except KeyboardInterrupt:
        print >> sys.stderr, '\nInterrupted, aborting'
        os.abort() # abort instead of exit so that multiprocessing won't wait

    return results
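Example #1 never shows the worker() it hands to Process; given the queue protocol above, a minimal sketch (the get_nowait loop and the batching of counter updates are assumptions, not the project's actual implementation) might look like this:

from Queue import Empty  # Python 2: multiprocessing queues raise Queue.Empty

def worker(processor, queue, results_queue, count, sync_count):
    pending = 0
    while True:
        try:
            i, data = queue.get_nowait()          # the queue is fully populated before start()
        except Empty:
            break                                 # queue drained, let this process exit
        results_queue.put((i, processor(data)))   # keep the original index so results can be reordered
        queue.task_done()                         # required for queue.join() in process()
        pending += 1
        if pending > sync_count:                  # flush the shared counter in batches
            with count.get_lock():
                count.value += pending
            pending = 0
    if pending:                                   # flush whatever is left on exit
        with count.get_lock():
            count.value += pending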
Example #2
    def batch_gradient_descent(self):
        if self.learning_rate is None or self.regularized_factor is None:
            return False

        total_iteration = 1500
        progress = Progress('Gradient Descent', total_iteration)

        log = []
        current_iteration = 1
        while current_iteration <= total_iteration:
            progress.report(current_iteration, self.cost)

            # ==> Compute partial derivatives
            # Derivative of cost function wrt movie features
            dj_dmovies = dict()
            for movie_id in self.movies:
                movie = self.movies[movie_id]
                n = len(movie.feature)
                dj_dmovies[movie.id] = []
                for k in range(0, n):
                    dj_dmovies[movie.id].append(
                        self.dj_wrt_movie_feature_k(movie, k))

            # Derivative of cost function wrt user preferences
            dj_dusers = dict()
            for user_id in self.users:
                user = self.users[user_id]
                n = len(user.theta)
                dj_dusers[user.id] = []
                for k in range(0, n):
                    dj_dusers[user.id].append(self.dj_wrt_user_theta_k(
                        user, k))

            # Apply gradient_descent
            for movie_id in dj_dmovies:
                dj_dfeature = dj_dmovies[movie_id]
                movie = self.movies[movie_id]
                n = len(movie.feature)
                for k in range(0, n):
                    movie.feature[k] = movie.feature[k] - (self.learning_rate *
                                                           dj_dfeature[k])

            for user_id in dj_dusers:
                dj_dtheta = dj_dusers[user_id]
                user = self.users[user_id]
                n = len(user.theta)
                for k in range(0, n):
                    user.theta[k] = user.theta[k] - (self.learning_rate *
                                                     dj_dtheta[k])

            current_iteration += 1
        progress.complete()
        return log
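The helpers dj_wrt_movie_feature_k and dj_wrt_user_theta_k are defined elsewhere in the class; under the usual regularized collaborative-filtering cost they would compute something like the sketch below (the movie.ratings mapping and the prediction expression are assumptions about the surrounding class, not its actual API):

    def dj_wrt_movie_feature_k(self, movie, k):
        # sum over every user who rated this movie: (prediction - rating) * theta_k,
        # plus the regularization term; movie.ratings is a hypothetical {user_id: rating} dict
        total = 0.0
        for user_id, rating in movie.ratings.items():
            user = self.users[user_id]
            predicted = sum(t * x for t, x in zip(user.theta, movie.feature))
            total += (predicted - rating) * user.theta[k]
        return total + self.regularized_factor * movie.feature[k]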
Example #3
    def content_based_batch_gradient_descent(self):
        if self.learning_rate is None or self.regularized_factor is None:
            return False

        total_iteration = 1000
        progress = Progress('Content-based Gradient Descent', total_iteration)

        log = []
        current_iteration = 1
        while current_iteration <= total_iteration:
            progress.report(current_iteration, self.content_based_cost)

            # ==> Compute partial derivatives
            # Derivative of cost function wrt movie features
            dj_duser = dict()
            for user_id in self.users:
                user = self.users[user_id]
                n = len(user.theta)
                dj_duser[user.id] = []
                for k in range(0, n):
                    if k == 0:
                        dj_duser[user.id].append(
                            self.dj_wrt_user_theta_k0(user))
                    else:
                        dj_duser[user.id].append(
                            self.dj_wrt_user_theta_k(user, k))

            # Apply gradient descent
            for user_id in dj_duser:
                dj_dtheta = dj_duser[user_id]
                user = self.users[user_id]
                n = len(user.theta)
                for k in range(0, n):
                    user.theta[k] = user.theta[k] - (self.learning_rate *
                                                     dj_dtheta[k])

            current_iteration += 1
        progress.complete()
        return log
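The k == 0 branch reflects the usual convention of leaving the bias term theta_0 unregularized in this linear-regression-style model; the two helpers would then differ only in the regularization term, roughly as below (user.ratings and self.predict are hypothetical names, not the project's API):

    def dj_wrt_user_theta_k0(self, user):
        # bias component: sum of errors times the constant first feature, no regularization
        return sum((self.predict(user, movie) - rating) * movie.feature[0]
                   for movie, rating in user.ratings)

    def dj_wrt_user_theta_k(self, user, k):
        # every other component: same error term plus regularized_factor * theta_k
        return (sum((self.predict(user, movie) - rating) * movie.feature[k]
                    for movie, rating in user.ratings)
                + self.regularized_factor * user.theta[k])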
Example #4
    def parse(self,sent_filename):
        """
        Use the Charniak parser to parse sentences, then convert the results to Stanford Dependencies.
        """
        rrp = CharniakParser.parser
        print "Begin Charniak parsing ..."
        parsed_filename = sent_filename+'.charniak.parse'
        parsed_trees = ''

        # will use multiprocessing to parallelize parsing

        queue = JoinableQueue()
        results = Queue()

        print 'Reading', sent_filename, '...'
        data = []
        with open(sent_filename,'rb') as f:
            for line in f:
                l = line.decode('utf8', errors='ignore')
                queue.put((len(data), l))
                data.append('')

        p = Progress(len(data), estimate=True, values=True) # output progress bar

        # define jobs
        count = Value('i', 0)
        num_threads = cpu_count()
        sync_count = len(data)/1000/num_threads  # batch size for shared-counter updates (Python 2 integer division)

        print 'Starting %i jobs ...' % num_threads

        jobs = [Process(target=parse_queue, args=(rrp, queue, results, count, sync_count, p if i == -1 else None, i)) for i in range(num_threads)]  # i == -1 never holds, so every worker gets None and the progress bar stays in the main process

        try:
            # start jobs
            for job in jobs:
                job.start()

            # gathering results from jobs
            total_count = 0
            while total_count < len(data):
                try:
                    item = results.get(True, 0.3)   # timeout delay small enough to update progress bar, see below
                    data[item[0]] = item[1]
                    total_count += 1
                except Empty:
                    pass
                p.set(count.value)  # even if no results are received (cached somewhere), the counter will be updated after get() timeout above
                # NOTE: the bar may linger at 100% briefly: the workers' finished counter runs ahead of
                # the received-results count, so it stays at 100% until all results have been drained.

            p.set(total_count)
            p.complete()

            # wait for jobs to finish
            queue.join()
            for job in jobs:
                job.join()

        except KeyboardInterrupt:
            print >> sys.stderr, '\nInterrupted, aborting'
            os.abort() # abort instead of exit so that multiprocessing won't wait

        print 'Writing', parsed_filename, '...'
        with open(parsed_filename, 'w') as f:
            for item in data:
                print >> f, item

        # convert parse tree to dependency tree
        print "Convert Charniak parse tree to Stanford Dependency tree ..."
        subprocess.call('./scripts/stdconvert.sh '+parsed_filename,shell=True)
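A hypothetical call to the method above (the corpus file name is made up):

parser = CharniakParser()
parser.parse('corpus.snt.tok')
# leaves 'corpus.snt.tok.charniak.parse' on disk; stdconvert.sh then produces the
# Stanford Dependency file that the "stdconv+charniak" branch of preprocess() in
# Example #5 reads back as '<tokenized file>.charniak.parse.dep'.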
Example #5
def preprocess(input_file,START_SNLP=True,INPUT_AMR=True, align=True, use_amr_tokens=False):
    '''Preprocess input_file: run Stanford CoreNLP (tokenization, POS tagging, NER), dependency parsing, and, for AMR input, JAMR alignment; returns the preprocessed instances.'''
    tmp_sent_filename = None
    instances = None
    tok_sent_filename = None
    
    if INPUT_AMR: # the input file is amr annotation

        amr_file = input_file
        if amr_file.endswith('.amr'):
            aligned_amr_file = amr_file + '.tok.aligned'
            amr_tok_file = amr_file + '.tok'
        else:
            aligned_amr_file = amr_file + '.amr.tok.aligned'
            amr_tok_file = amr_file + '.amr.tok'

        tmp_sent_filename = amr_file+'.sent'
        tok_sent_filename = tmp_sent_filename+'.tok' # write tokenized sentence file

        comments,amr_strings = readAMR(amr_file)
        if os.path.exists(aligned_amr_file):
            print "Reading aligned AMR ..."
            # read aligned amr and transfer alignment comments
            comments_with_alignment,_ = readAMR(aligned_amr_file)
            for comment,comment_with_alignment in zip(comments,comments_with_alignment):
                comment['alignments'] = comment_with_alignment['alignments']

        tokenized_sentences = None
        try:
            if use_amr_tokens:
                tokenized_sentences = [c['tok'] for c in comments] # here should be 'snt'
                if not os.path.exists(tok_sent_filename):
                    with open(tok_sent_filename,'w') as f:
                        for sentence in tokenized_sentences:
                            print >> f, sentence
                if tokenized_sentences:
                    print >> log, "AMR has tokens, will use them"
        except:
            raise

        sentences = [c['snt'] for c in comments]
        if not os.path.exists(tmp_sent_filename): # write sentences into file
            _write_sentences(tmp_sent_filename,sentences)

        print >> log, "Start Stanford CoreNLP..."
        proc1 = StanfordCoreNLP(tokenize=not tokenized_sentences)

        # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
        if START_SNLP: proc1.setup()

        instances = proc1.parse(tmp_sent_filename if proc1.tokenize else tok_sent_filename)

        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename,instances)

        if len(instances) == 0:
            print 'Error: no instances!'
            sys.exit(1)

        if not os.path.exists(amr_tok_file): # write tokenized amr file
            _write_tok_amr(amr_tok_file,amr_file,instances)
            
        if not os.path.exists(aligned_amr_file) and align:
            # align
            print "Call JAMR to generate alignment ..."
            subprocess.call('./scripts/jamr_align.sh '+amr_tok_file,shell=True)
            print "Reading aligned AMR ..."
            # read aligned amr and transfer alignment comments
            comments_with_alignment,_ = readAMR(aligned_amr_file)
            for comment,comment_with_alignment in zip(comments,comments_with_alignment):
                comment['alignments'] = comment_with_alignment['alignments']

        from progress import Progress
        p = Progress(len(instances), estimate=True, values=True)
        print 'Parsing AMR:'
        SpanGraph.graphID = 0
        for i in range(len(instances)):

            amr = AMR.parse_string(amr_strings[i])
            if 'alignments' in comments[i]:
                alignment,s2c_alignment = Aligner.readJAMRAlignment(amr,comments[i]['alignments'])
                #ggraph = SpanGraph.init_ref_graph(amr,alignment,instances[i].tokens)
                ggraph = SpanGraph.init_ref_graph_abt(amr,alignment,s2c_alignment,instances[i].tokens)
                #ggraph.pre_merge_netag(instances[i])
                #print >> log, "Graph ID:%s\n%s\n"%(ggraph.graphID,ggraph.print_tuples())
                instances[i].addAMR(amr)
                instances[i].addGoldGraph(ggraph)
            instances[i].addComment(comments[i])
            p += 1
        p.complete()

    else:
        # input file is sentence
        tmp_sent_filename = input_file 

        print >> log, "Start Stanford CoreNLP ..."
        proc1 = StanfordCoreNLP()

        # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
        if START_SNLP: proc1.setup()
        instances = proc1.parse(tmp_sent_filename)

        tok_sent_filename = tmp_sent_filename+'.tok' # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename,instances)
        
    # preprocess 2: dependency parsing 
    if constants.FLAG_DEPPARSER == "stanford":
        dep_filename = tok_sent_filename+'.stanford.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = StanfordDepParser()
            dep_result = dparser.parse(tok_sent_filename)
            output_dep = open(dep_filename,'w')            
            output_dep.write(dep_result)
            output_dep.close()
            
        _add_dependency(instances,dep_result)
    elif constants.FLAG_DEPPARSER == "stanfordConvert":
        dep_filename = tok_sent_filename+'.stanford.parse.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)

            dep_result = open(dep_filename,'r').read()
        else:
            raise IOError('Converted dependency file %s not found' % (dep_filename))

        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "stdconv+charniak":
        dep_filename = tok_sent_filename+'.charniak.parse.dep'
        if not os.path.exists(dep_filename):
            dparser = CharniakParser()
            dparser.parse(tok_sent_filename)
            #raise IOError('Converted dependency file %s not founded' % (dep_filename))
        print 'Read dependency file %s...' % (dep_filename)
        dep_result = open(dep_filename,'r').read()
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)
            
    elif constants.FLAG_DEPPARSER == "clear":
        dep_filename = tok_sent_filename+'.clear.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = ClearDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "turbo":
        dep_filename = tok_sent_filename+'.turbo.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = TurboDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)

    elif constants.FLAG_DEPPARSER == "mate":
        dep_filename = tok_sent_filename+'.mate.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)                                                                 
            dep_result = open(dep_filename,'r').read()
        else:
            dparser = MateDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances,dep_result,constants.FLAG_DEPPARSER)
    else:
        pass
    
    if constants.FLAG_PROP:
        print >> log, "Adding SRL information..."
        prop_filename = tok_sent_filename + '.prop'
        if os.path.exists(prop_filename):
            if constants.FLAG_DEPPARSER == "stdconv+charniak":
                _add_prop(instances,prop_filename,dep_filename,FIX_PROP_HEAD=True)
            else:
                _add_prop(instances,prop_filename,dep_filename)
            
        else:
            raise IOError('Semantic role labeling file %s not found!' % (prop_filename))  # IOError: FileNotFoundError does not exist in Python 2

        
    return instances
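A hypothetical invocation of preprocess() (the path is made up; the flags mirror the defaults above):

instances = preprocess('data/training.amr', START_SNLP=True, INPUT_AMR=True, align=True)
print 'Preprocessed %i instances' % len(instances)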