def process(input_data, processor, max_refresh_delay=0.3):
    # will use multiprocessing to parallelize parsing
    queue = JoinableQueue()
    results_queue = Queue()

    # populate queue, reserve place for results
    results = []
    for i, data in enumerate(input_data):
        queue.put((i, data))
        results.append(None)

    p = Progress(len(input_data), estimate=True, values=True)  # output progress bar

    # define jobs
    count = Value('i', 0)
    num_threads = cpu_count()
    sync_count = len(input_data) / 1000 / num_threads
    print 'Starting %i jobs ...' % num_threads
    jobs = [Process(target=worker,
                    args=(processor, queue, results_queue, count, sync_count))
            for i in range(num_threads)]

    try:
        # start jobs
        for job in jobs:
            job.start()

        # gathering results from jobs
        total_count = 0
        while total_count < len(input_data):
            try:
                # timeout delay small enough to update progress bar, see below
                item = results_queue.get(True, max_refresh_delay)
                results[item[0]] = item[1]
                total_count += 1
            except Empty:
                pass
            # even if no results are received (cached somewhere), the counter
            # will be updated after the get() timeout above
            p.set(count.value)
        # NOTE: There might be a slight delay after reaching 100%, because the
        # finished results counter is ahead of the received results counter;
        # it will stay at 100% until all results are received.
        p.set(total_count)
        p.complete()

        # wait for jobs to finish
        queue.join()
        for job in jobs:
            job.join()
    except KeyboardInterrupt:
        print >> sys.stderr, '\nInterrupted, aborting'
        os.abort()  # abort instead of exit so that multiprocessing won't wait
    return results
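# The `worker` target handed to Process above is not shown in this excerpt.
# Below is a minimal sketch of a compatible implementation (illustrative only,
# not the repository's actual worker). It assumes every item is enqueued before
# the jobs start, so an empty queue means there is no more work.
from Queue import Empty


def worker(processor, queue, results_queue, count, sync_count):
    done_since_sync = 0
    while True:
        try:
            i, data = queue.get(True, 1)           # short timeout; queue is pre-filled
        except Empty:
            break                                  # nothing left to do
        try:
            results_queue.put((i, processor(data)))
        finally:
            queue.task_done()                      # lets queue.join() in process() return
        done_since_sync += 1
        if done_since_sync >= max(sync_count, 1):  # batch counter updates to limit lock traffic
            with count.get_lock():
                count.value += done_since_sync
            done_since_sync = 0
    if done_since_sync:                            # flush whatever is left
        with count.get_lock():
            count.value += done_since_sync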
def batch_gradient_descent(self):
    if self.learning_rate is None or self.regularized_factor is None:
        return False

    total_iteration = 1500
    progress = Progress('Gradient Descent', total_iteration)
    log = []

    current_iteration = 1
    while current_iteration <= total_iteration:
        progress.report(current_iteration, self.cost)

        # ==> Compute partial derivatives

        # Derivative of cost function wrt movie features
        dj_dmovies = dict()
        for movie_id in self.movies:
            movie = self.movies[movie_id]
            n = len(movie.feature)
            dj_dmovies[movie.id] = []
            for k in range(0, n):
                dj_dmovies[movie.id].append(self.dj_wrt_movie_feature_k(movie, k))

        # Derivative of cost function wrt user preferences
        dj_dusers = dict()
        for user_id in self.users:
            user = self.users[user_id]
            n = len(user.theta)
            dj_dusers[user.id] = []
            for k in range(0, n):
                dj_dusers[user.id].append(self.dj_wrt_user_theta_k(user, k))

        # Apply gradient descent
        for movie_id in dj_dmovies:
            dj_dfeature = dj_dmovies[movie_id]
            movie = self.movies[movie_id]
            n = len(movie.feature)
            for k in range(0, n):
                movie.feature[k] = movie.feature[k] - (self.learning_rate * dj_dfeature[k])

        for user_id in dj_dusers:
            dj_dtheta = dj_dusers[user_id]
            user = self.users[user_id]
            n = len(user.theta)
            for k in range(0, n):
                user.theta[k] = user.theta[k] - (self.learning_rate * dj_dtheta[k])

        current_iteration += 1

    progress.complete()
    return log
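# The loops above perform a *simultaneous* update: every partial derivative is
# computed before any movie feature or user preference is changed. Purely as an
# illustration (the real derivatives come from dj_wrt_movie_feature_k and
# dj_wrt_user_theta_k), one such step looks like this in vectorized form,
# assuming the usual regularized collaborative-filtering cost:
import numpy as np


def collaborative_filtering_step(X, Theta, Y, R, alpha, lam):
    """X: (num_movies, n) features, Theta: (num_users, n) preferences,
    Y: (num_movies, num_users) ratings, R: 1 where a rating exists."""
    E = (X.dot(Theta.T) - Y) * R            # prediction error on rated entries only
    dJ_dX = E.dot(Theta) + lam * X          # gradients for both parameter sets,
    dJ_dTheta = E.T.dot(X) + lam * Theta    # computed before either is updated
    return X - alpha * dJ_dX, Theta - alpha * dJ_dTheta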
def content_based_batch_gradient_descent(self):
    if self.learning_rate is None or self.regularized_factor is None:
        return False

    total_iteration = 1000
    progress = Progress('Content-based Gradient Descent', total_iteration)
    log = []

    current_iteration = 1
    while current_iteration <= total_iteration:
        progress.report(current_iteration, self.content_based_cost)

        # ==> Compute partial derivatives

        # Derivative of cost function wrt user preferences; theta[0] (the bias
        # term) is handled separately, typically because it is not regularized
        dj_duser = dict()
        for user_id in self.users:
            user = self.users[user_id]
            n = len(user.theta)
            dj_duser[user.id] = []
            for k in range(0, n):
                if k == 0:
                    dj_duser[user.id].append(self.dj_wrt_user_theta_k0(user))
                else:
                    dj_duser[user.id].append(self.dj_wrt_user_theta_k(user, k))

        # Apply gradient descent
        for user_id in dj_duser:
            dj_dtheta = dj_duser[user_id]
            user = self.users[user_id]
            n = len(user.theta)
            for k in range(0, n):
                user.theta[k] = user.theta[k] - (self.learning_rate * dj_dtheta[k])

        current_iteration += 1

    progress.complete()
    return log
def parse(self, sent_filename):
    """Use the Charniak parser to parse sentences,
    then convert the results to Stanford Dependencies.
    """
    rrp = CharniakParser.parser
    print "Begin Charniak parsing ..."
    parsed_filename = sent_filename + '.charniak.parse'
    parsed_trees = ''

    # will use multiprocessing to parallelize parsing
    queue = JoinableQueue()
    results = Queue()

    print 'Reading', sent_filename, '...'
    data = []
    with open(sent_filename, 'rb') as f:
        for line in f:
            l = line.decode('utf8', 'ignore')
            queue.put((len(data), l))
            data.append('')

    p = Progress(len(data), estimate=True, values=True)  # output progress bar

    # define jobs
    count = Value('i', 0)
    num_threads = cpu_count()
    sync_count = len(data) / 1000 / num_threads
    print 'Starting %i jobs ...' % num_threads
    jobs = [Process(target=parse_queue,
                    args=(rrp, queue, results, count, sync_count,
                          p if i == -1 else None, i))
            for i in range(num_threads)]

    try:
        # start jobs
        for job in jobs:
            job.start()

        # gathering results from jobs
        total_count = 0
        while total_count < len(data):
            try:
                # timeout delay small enough to update progress bar, see below
                item = results.get(True, 0.3)
                data[item[0]] = item[1]
                total_count += 1
            except Empty:
                pass
            # even if no results are received (cached somewhere), the counter
            # will be updated after the get() timeout above
            p.set(count.value)
        # NOTE: There might be a slight delay after reaching 100%, because the
        # finished results counter is ahead of the received results counter;
        # it will stay at 100% until all results are received.
        p.set(total_count)
        p.complete()

        # wait for jobs to finish
        queue.join()
        for job in jobs:
            job.join()
    except KeyboardInterrupt:
        print >> sys.stderr, '\nInterrupted, aborting'
        os.abort()  # abort instead of exit so that multiprocessing won't wait

    print 'Writing', parsed_filename, '...'
    with open(parsed_filename, 'w') as f:
        for item in data:
            print >> f, item

    # convert parse tree to dependency tree
    print "Convert Charniak parse tree to Stanford Dependency tree ..."
    subprocess.call('./scripts/stdconvert.sh ' + parsed_filename, shell=True)
def preprocess(input_file, START_SNLP=True, INPUT_AMR=True, align=True, use_amr_tokens=False):
    '''nasty function'''
    tmp_sent_filename = None
    instances = None
    tok_sent_filename = None

    if INPUT_AMR:  # the input file is amr annotation
        amr_file = input_file
        if amr_file.endswith('.amr'):
            aligned_amr_file = amr_file + '.tok.aligned'
            amr_tok_file = amr_file + '.tok'
        else:
            aligned_amr_file = amr_file + '.amr.tok.aligned'
            amr_tok_file = amr_file + '.amr.tok'
        tmp_sent_filename = amr_file + '.sent'
        tok_sent_filename = tmp_sent_filename + '.tok'  # tokenized sentence file

        comments, amr_strings = readAMR(amr_file)
        if os.path.exists(aligned_amr_file):
            print "Reading aligned AMR ..."
            # read aligned amr and transfer alignment comments
            comments_with_alignment, _ = readAMR(aligned_amr_file)
            for comment, comment_with_alignment in zip(comments, comments_with_alignment):
                comment['alignments'] = comment_with_alignment['alignments']

        tokenized_sentences = None
        try:
            if use_amr_tokens:
                tokenized_sentences = [c['tok'] for c in comments]  # here should be 'snt'
                if not os.path.exists(tok_sent_filename):
                    with open(tok_sent_filename, 'w') as f:
                        for sentence in tokenized_sentences:
                            print >> f, sentence
                if tokenized_sentences:
                    print >> log, "AMR has tokens, will use them"
        except:
            raise

        sentences = [c['snt'] for c in comments]
        if not os.path.exists(tmp_sent_filename):  # write sentences into file
            _write_sentences(tmp_sent_filename, sentences)

        print >> log, "Start Stanford CoreNLP..."
        # preprocess 1: tokenization, POS tagging and named entity recognition using Stanford CoreNLP
        proc1 = StanfordCoreNLP(tokenize=not tokenized_sentences)
        if START_SNLP:
            proc1.setup()
        instances = proc1.parse(tmp_sent_filename if proc1.tokenize else tok_sent_filename)

        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename, instances)

        if len(instances) == 0:
            print 'Error: no instances!'
            sys.exit(1)

        if not os.path.exists(amr_tok_file):  # write tokenized amr file
            _write_tok_amr(amr_tok_file, amr_file, instances)

        if not os.path.exists(aligned_amr_file) and align:  # align
            print "Call JAMR to generate alignment ..."
            subprocess.call('./scripts/jamr_align.sh ' + amr_tok_file, shell=True)
            print "Reading aligned AMR ..."
            # read aligned amr and transfer alignment comments
            comments_with_alignment, _ = readAMR(aligned_amr_file)
            for comment, comment_with_alignment in zip(comments, comments_with_alignment):
                comment['alignments'] = comment_with_alignment['alignments']

        from progress import Progress
        p = Progress(len(instances), estimate=True, values=True)
        print 'Parsing AMR:'
        SpanGraph.graphID = 0
        for i in range(len(instances)):
            amr = AMR.parse_string(amr_strings[i])
            if 'alignments' in comments[i]:
                alignment, s2c_alignment = Aligner.readJAMRAlignment(amr, comments[i]['alignments'])
                #ggraph = SpanGraph.init_ref_graph(amr,alignment,instances[i].tokens)
                ggraph = SpanGraph.init_ref_graph_abt(amr, alignment, s2c_alignment, instances[i].tokens)
                #ggraph.pre_merge_netag(instances[i])
                #print >> log, "Graph ID:%s\n%s\n"%(ggraph.graphID,ggraph.print_tuples())
                instances[i].addAMR(amr)
                instances[i].addGoldGraph(ggraph)
                instances[i].addComment(comments[i])
            p += 1
        p.complete()
    else:  # input file is sentence
        tmp_sent_filename = input_file

        print >> log, "Start Stanford CoreNLP ..."
        # preprocess 1: tokenization, POS tagging and named entity recognition using Stanford CoreNLP
        proc1 = StanfordCoreNLP()
        if START_SNLP:
            proc1.setup()
        instances = proc1.parse(tmp_sent_filename)

        tok_sent_filename = tmp_sent_filename + '.tok'  # write tokenized sentence file
        if not os.path.exists(tok_sent_filename):
            _write_tok_sentences(tok_sent_filename, instances)

    # preprocess 2: dependency parsing
    if constants.FLAG_DEPPARSER == "stanford":
        dep_filename = tok_sent_filename + '.stanford.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = StanfordDepParser()
            dep_result = dparser.parse(tok_sent_filename)
            output_dep = open(dep_filename, 'w')
            output_dep.write(dep_result)
            output_dep.close()
        _add_dependency(instances, dep_result)
    elif constants.FLAG_DEPPARSER == "stanfordConvert":
        dep_filename = tok_sent_filename + '.stanford.parse.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            raise IOError('Converted dependency file %s not found' % (dep_filename))
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "stdconv+charniak":
        dep_filename = tok_sent_filename + '.charniak.parse.dep'
        if not os.path.exists(dep_filename):
            dparser = CharniakParser()
            dparser.parse(tok_sent_filename)
            #raise IOError('Converted dependency file %s not founded' % (dep_filename))
        print 'Read dependency file %s...' % (dep_filename)
        dep_result = open(dep_filename, 'r').read()
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "clear":
        dep_filename = tok_sent_filename + '.clear.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = ClearDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "turbo":
        dep_filename = tok_sent_filename + '.turbo.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = TurboDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    elif constants.FLAG_DEPPARSER == "mate":
        dep_filename = tok_sent_filename + '.mate.dep'
        if os.path.exists(dep_filename):
            print 'Read dependency file %s...' % (dep_filename)
            dep_result = open(dep_filename, 'r').read()
        else:
            dparser = MateDepParser()
            dep_result = dparser.parse(tok_sent_filename)
        _add_dependency(instances, dep_result, constants.FLAG_DEPPARSER)
    else:
        pass

    if constants.FLAG_PROP:
        print >> log, "Adding SRL information..."
        prop_filename = tok_sent_filename + '.prop'
        if os.path.exists(prop_filename):
            if constants.FLAG_DEPPARSER == "stdconv+charniak":
                _add_prop(instances, prop_filename, dep_filename, FIX_PROP_HEAD=True)
            else:
                _add_prop(instances, prop_filename, dep_filename)
        else:
            raise IOError('Semantic role labeling file %s not found!' % (prop_filename))

    return instances
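# A minimal usage sketch (the file path and flag values are illustrative, not
# taken from the repository's driver script): preprocess an AMR-annotated
# corpus and get back one annotated instance per sentence.
if __name__ == '__main__':
    train_instances = preprocess('data/training.amr',
                                 START_SNLP=True,   # run CoreNLP tokenization/POS/NER
                                 INPUT_AMR=True,    # input carries AMR annotations
                                 align=True)        # call JAMR if no alignment file exists
    print 'Preprocessed %i instances' % len(train_instances)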