def test_parallel(start):
    pairs_file = open(os.path.join(os.pardir, "data", "pairs.pkl"), 'rb')
    pairs = pickle.load(pairs_file)
    pairs_file.close()
    synonyms_file = open(os.path.join(os.pardir, "data", "synonyms.pkl"), 'rb')
    synonyms = pickle.load(synonyms_file)
    synonyms_file.close()
    pattern_matcher = PatternMatcher()
    connection = pymysql.connect(host='localhost',
                                 user='******',
                                 password='******'.format(sys.argv[1]),
                                 db='stackoverflow')
    with connection.cursor() as cursor:
        sql = "SELECT Id, Body FROM Posts WHERE Score >= 0 AND Id >= {} AND Id < {}".format(
            start, start + 100)
        cursor.execute(sql)
        for i in range(cursor.rowcount):
            # post_count += 1
            current_id, row = cursor.fetchone()
            word_list = get_words(row)
            # total_sent_count += len(word_list)
            for words in word_list:
                rtn = check_tech_pairs(words)
                if rtn is not None:
                    words = rtn[0].split(" ")
                    pattern_matcher.match_pattern(words, current_id, rtn[1],
                                                  "keytechs")
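# A minimal driver sketch, not part of the original snippet: test_parallel reads
# a fixed 100-Id window starting at `start`, so disjoint windows can be fanned
# out across processes. The overall Id range and worker count below are
# assumptions for illustration only.
if __name__ == '__main__':
    from multiprocessing import Pool
    with Pool(processes=4) as pool:
        pool.map(test_parallel, range(0, 100000, 100))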
def main(techs, id):
    ids.append(" ".join(techs))
    s.append("")
    current_id = 0
    try:
        nlp = spacy.load('en')
        matcher = Matcher(nlp.vocab)
        add_patterns(matcher)
        cnx = mysql.connector.connect(host='localhost',
                                      user='******',
                                      password=pw,
                                      db='stackoverflow')
        cursor = cnx.cursor()
        query = "SELECT Id, Body FROM {} WHERE ParentId={} AND Score >= 0".format(
            table_name, id)
        cursor.execute(query)
        for current_id, row in cursor.fetchall():
            # with open(os.path.join(os.pardir, "usefulness",
            #                        "{}.txt".format(os.getpid())), "a") as out_file:
            #     out_file.write(str(current_id)+"\n")
            word_list = get_words(row)
            for words in word_list:
                if words == []:
                    continue
                (words, tags) = get_pos_tag(techs, words)
                patterns = matcher(nlp(" ".join(tags)))
                if patterns != []:
                    ids.append(current_id)
                    s.append(" ".join(words))
    finally:
        print(current_id)
        ids.append("")
        s.append("")
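# add_patterns() is not shown in this snippet. A hedged sketch of what such a
# helper could look like with the spaCy 2.x-style Matcher API used above,
# matching a tag sequence such as "JJR CIN" (comparative adjective followed by
# a comparison marker) in the space-joined tag string. The key name and the
# pattern content are assumptions for illustration, not the project's actual
# pattern set.
def add_patterns_example(matcher):
    matcher.add("COMPARATIVE", None, [{"ORTH": "JJR"}, {"ORTH": "CIN"}])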
def __iter__(self):
    current_id = 0
    try:
        cnx = mysql.connector.connect(host='localhost',
                                      user='******',
                                      password='******',
                                      db='stackoverflow')
        cursor = cnx.cursor()
        query = "SELECT Id, Body FROM Posts WHERE Score >= 0 AND Id < 40000000"
        cursor.execute(query)
        for current_id, row in cursor.fetchall():
            words_list = get_words(row)
            for words in words_list:
                yield words
    finally:
        print("current_id: {}".format(current_id))
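# A self-contained sketch of the streaming-corpus pattern behind __iter__
# above: a class whose __iter__ yields one tokenized sentence at a time, so a
# consumer that re-iterates its input (e.g. a word-embedding trainer) never
# needs the whole Posts table in memory. The in-memory rows below stand in for
# the database query purely for illustration.
class SentenceStream(object):
    def __init__(self, rows):
        self.rows = rows

    def __iter__(self):
        for row in self.rows:
            # the real class runs each post Body through get_words() instead
            yield row.split()

for words in SentenceStream(["python is great", "java is verbose"]):
    print(words)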
def main(start):
    # Scan one batch of posts for sentences that mention a known tech pair and
    # append each match to a per-process output file.
    compa_sent_count = 0
    total_sent_count = 0
    post_count = 0
    current_id = 0
    try:
        cnx = mysql.connector.connect(host='localhost',
                                      user='******',
                                      password=pw,
                                      db='stackoverflow')
        cursor = cnx.cursor()
        query = "SELECT Id, Body FROM {} WHERE Score >= 0 AND Id >= {} AND Id < {}".format(
            table_name, start, start + batch)
        cursor.execute(query)
        for current_id, row in cursor.fetchall():
            post_count += 1
            word_list = get_words(row)
            total_sent_count += len(word_list)
            for words in word_list:
                if words == []:
                    continue
                rtn = check_tech_pairs(words)
                if rtn is not None:
                    compa_sent_count += 1
                    data_file = open(
                        os.path.join(os.pardir, "out", table_name,
                                     "{}.txt".format(os.getpid())), "a")
                    data_file.write("{}\n".format(current_id))
                    data_file.write("{}\n".format(rtn[1]))
                    data_file.write("{}\n".format(rtn[0]))
                    data_file.write("\n")
                    data_file.close()
    finally:
        print("Proc {}: {}/{} from {} to {} ({} posts)".format(
            os.getpid(), compa_sent_count, total_sent_count, start, current_id,
            post_count))
def main(start):
    # Scan one batch of non-question posts, pair each sentence with its
    # neighbouring sentences, and log "changed" and "leased" matches to
    # separate per-process output files.
    compa_sent_count = 0
    total_sent_count = 0
    post_count = 0
    current_id = 0
    old_pattern_matcher = OldPatternMatcher()
    try:
        pre_words = []
        post_words = []
        conn = psycopg2.connect(
            'dbname=stackoverflow port=5432 host=localhost')
        cursor = conn.cursor()
        query = "SELECT Id, Body FROM {} WHERE Score >= 0 AND posttypeid != 1 AND Id >= {} AND Id < {}".format(
            table_name, start, start + batch)
        # query = "SELECT Id, Body FROM Posts WHERE Id = 3979"
        # query = "SELECT Id, Body FROM Posts WHERE Id = 115838 "
        cursor.execute(query)
        for current_id, row in cursor.fetchall():
            post_count += 1
            word_list = get_words(row)
            total_sent_count += len(word_list)
            for idx in range(0, len(word_list), 2):
                if idx == 0:
                    pre_words = []
                else:
                    pre_words = word_list[idx - 1]
                words = word_list[idx]
                if idx != len(word_list) - 1:
                    post_words = word_list[idx + 1]
                else:
                    post_words = []
                if words == []:
                    continue
                rtns = check_tech_pairs(pre_words, words, post_words, words,
                                        post_words, current_id)
                for rtn in rtns:
                    if rtn is not None:
                        if len(rtn) == 4:
                            compa_sent_count += 1
                            data_file = open(
                                os.path.join(
                                    os.pardir, "outnew",
                                    "oldPattern_{}_v4".format(table_name),
                                    "changed_{}.txt".format(os.getpid())), "a")
                            data_file.write("{}\n".format(current_id))
                            data_file.write("{}\n".format(rtn[1]))
                            data_file.write("Changed: \n{}\n".format(rtn[0]))
                            if rtn[2] == "word":
                                data_file.write("Origin: \n{}\n".format(
                                    ' '.join(words)))
                            else:
                                data_file.write("Origin: \n{}\n".format(
                                    ' '.join(post_words)))
                            data_file.write("\n\n")
                            data_file.close()
                            old_pattern_matcher.old_match_pattern(
                                rtn[0], current_id, rtn[1], table_name,
                                rtn[-1])
                        else:
                            compa_sent_count += 1
                            data_file = open(
                                os.path.join(
                                    os.pardir, "outnew",
                                    "{}_v4".format(table_name),
                                    "leased_{}.txt".format(os.getpid())), "a")
                            data_file.write("{}\n".format(current_id))
                            data_file.write("{}\n".format(rtn[3]))
                            data_file.write("{}\n".format(rtn[0]))
                            data_file.write("{}\n".format(rtn[1]))
                            data_file.write("{}\n".format(rtn[2]))
                            data_file.write("\n")
                            data_file.close()
    finally:
        print("Proc {}: {}/{} from {} to {} ({} posts)".format(
            os.getpid(), compa_sent_count, total_sent_count, start, current_id,
            post_count))
def main(start):
    compa_sent_count = 0
    total_sent_count = 0
    post_count = 0
    current_id = 0
    pattern_matcher = PatternMatcher()
    try:
        pre_words = []
        post_words = []
        conn = psycopg2.connect(
            'dbname=stackoverflow port=5433 host=localhost')
        cursor = conn.cursor()
        # query = "SELECT Id, Body FROM {} WHERE Score > 0 AND posttypeid != 1 AND Id >= {} AND Id < {}".format(
        #     table_name, start, start + batch)
        query = "SELECT Id, Body FROM Posts WHERE Id = 145200 or Id = 6713"
        cursor.execute(query)
        for current_id, row in cursor.fetchall():
            post_count += 1
            word_list = get_words(row)
            total_sent_count += len(word_list)
            for idx in range(0, len(word_list), 2):
                if idx == 0:
                    pre_words = []
                else:
                    pre_words = word_list[idx - 1]
                words = word_list[idx]
                if idx != len(word_list) - 1:
                    post_words = word_list[idx + 1]
                else:
                    post_words = []
                if words == []:
                    continue
                rtn = check_tech_pairs(pre_words, words, post_words)
                if rtn is not None:
                    if len(rtn) == 2:
                        compa_sent_count += 1
                        data_file = open(
                            os.path.join(os.pardir, "outnew",
                                         "{}_v4".format(table_name),
                                         "{}.txt".format(os.getpid())), "a")
                        data_file.write("{}\n".format(current_id))
                        data_file.write("{}\n".format(rtn[1]))
                        data_file.write("{}\n".format(rtn[0]))
                        data_file.write("\n")
                        data_file.close()
                    else:
                        compa_sent_count += 1
                        data_file = open(
                            os.path.join(os.pardir, "outnew",
                                         "{}_v4".format(table_name),
                                         "{}.txt".format(os.getpid())), "a")
                        data_file.write("{}\n".format(current_id))
                        data_file.write("{}\n".format(rtn[3]))
                        data_file.write("{}\n".format(rtn[0]))
                        data_file.write("{}\n".format(rtn[1]))
                        data_file.write("{}\n".format(rtn[2]))
                        data_file.write("\n")
                        data_file.close()
                        pairs = rtn[3].split()
                        known_pairs = []
                        for x, y in grouped(pairs, 2):
                            if [x, y] not in known_pairs and [y, x] not in known_pairs:
                                pattern_matcher.match_pattern(
                                    rtn[0], rtn[1], rtn[2], current_id,
                                    "{} {}".format(x, y))
                                known_pairs.append([x, y])
    finally:
        print("Proc {}: {}/{} from {} to {} ({} posts)".format(
            os.getpid(), compa_sent_count, total_sent_count, start, current_id,
            post_count))
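# grouped() is used above but not defined in this snippet. A common
# implementation (assumed here, not taken from the original code) walks a flat
# sequence in fixed-size chunks, so "a b c d".split() yields ("a", "b") and
# ("c", "d"), i.e. one tech pair per step.
def grouped(iterable, n):
    # zip the same iterator with itself n times to consume n items per tuple
    return zip(*[iter(iterable)] * n)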
nlp = spacy.load('en')
matcher = Matcher(nlp.vocab)
add_patterns(matcher)
start_id = start = 20000
end = 50000
try:
    with connection.cursor() as cursor:
        while start < end:
            sql = "SELECT Id, Body FROM Posts WHERE Score >= 0 AND Id >= {} AND Id < {}".format(
                start, start + 100)
            cursor.execute(sql)
            for current_id, row in cursor.fetchall():
                post_count += 1
                # row = cursor.fetchone()
                word_list = get_words(row)
                total_sent_count += len(word_list)
                for sent in word_list:
                    tagged_words = nltk.pos_tag(sent)
                    # print tagged_words
                    tag_list = []
                    for (word, tag) in tagged_words:
                        if tag == "IN" and word in cin:
                            tag_list.append("CIN")
                        elif tag[:2] == "VB" and word in cv:
                            tag_list.append("CV")
                        else:
                            cursor.execute(
                                "SELECT * FROM Tags WHERE TagName = \'{}\'".format(word))
from prepros import get_words
# import mysql.connector
from tech_sentences import check_tech_pairs_v2
import os
import pickle

# cnx = mysql.connector
# # cnx = mysql.connector.connect(host='localhost',
#                                 user='******',
#                                 password='******',
#                                 db='stackoverflow')
# cursor = cnx.cursor()
# query = "SELECT Body FROM Posts WHERE Id=120140"
# cursor.execute(query)

words = get_words(
    "Basically he was saying that ASP.NET MVC is not for large-scale enterprise applications"
)
print(words[0])
(line, techs) = check_tech_pairs_v2(words[0])
print(line)