Example #1
import os
import pickle
import sys

import pymysql

# Project-local helpers. The module paths are assumed here (prepros and
# tech_sentences follow Example #8; PatternMatcher's module is a guess),
# since the imports are not part of the original snippet.
from prepros import get_words
from tech_sentences import check_tech_pairs
from pattern_matcher import PatternMatcher


def test_parallel(start):
    pairs_file = open(os.path.join(os.pardir, "data", "pairs.pkl"), 'rb')
    pairs = pickle.load(pairs_file)
    pairs_file.close()

    synonyms_file = open(os.path.join(os.pardir, "data", "synonyms.pkl"), 'rb')
    synonyms = pickle.load(synonyms_file)
    synonyms_file.close()
    pattern_matcher = PatternMatcher()
    connection = pymysql.connect(host='localhost',
                                 user='******',
                                 password='******'.format(sys.argv[1]),  # credentials masked in the source
                                 db='stackoverflow')
    with connection.cursor() as cursor:
        sql = "SELECT Id, Body FROM Posts WHERE Score >= 0 AND Id >= {} AND Id < {}".format(
            start, start + 100)
        cursor.execute(sql)
        for i in range(cursor.rowcount):
            # post_count += 1
            current_id, row = cursor.fetchone()
            word_list = get_words(row)
            # total_sent_count += len(word_list)

            for words in word_list:
                rtn = check_tech_pairs(words)
                if rtn is not None:
                    words = rtn[0].split(" ")
                    pattern_matcher.match_pattern(words, current_id, rtn[1],
                                                  "keytechs")
Example #2
def main(techs, id):
    # `ids` and `s` are module-level result lists; `pw` and `table_name` are
    # module-level settings. Their definitions are not part of this snippet.
    ids.append(" ".join(techs))
    s.append("")
    current_id = 0
    try:
        nlp = spacy.load('en')
        matcher = Matcher(nlp.vocab)
        add_patterns(matcher)
        cnx = mysql.connector.connect(host='localhost',
                                      user='******',
                                      password=pw,
                                      db='stackoverflow')
        cursor = cnx.cursor()
        query = "SELECT Id, Body FROM {} WHERE ParentId={} AND Score >= 0".format(
            table_name, id)
        cursor.execute(query)
        for current_id, row in cursor.fetchall():
            # with open(os.path.join(os.pardir, "usefulness", "{}.txt".format(os.getpid())), "a") as out_file:
            #     out_file.write(str(current_id)+"\n")
            word_list = get_words(row)

            for words in word_list:
                if words == []:
                    continue
                (words, tags) = get_pos_tag(techs, words)
                patterns = matcher(nlp(" ".join(tags)))
                if patterns != []:
                    ids.append(current_id)
                    s.append(" ".join(words))
    finally:
        print(current_id)
        ids.append("")
        s.append("")
Example #3
def __iter__(self):
    # Method of a sentence-iterator class (the class itself is not shown):
    # streams tokenised sentences from every non-negatively-scored post.
    current_id = 0
    try:
        cnx = mysql.connector.connect(host='localhost',
                                      user='******',
                                      password='******',
                                      db='stackoverflow')
        cursor = cnx.cursor()
        query = "SELECT Id, Body FROM Posts WHERE Score >= 0 AND Id < 40000000"
        cursor.execute(query)
        for current_id, row in cursor.fetchall():
            words_list = get_words(row)
            for words in words_list:
                yield words
    finally:
        print("current_id: {}".format(current_id))
Example #4
def main(start):
    # `table_name`, `batch` and `pw` are module-level settings (not shown here).
    compa_sent_count = 0
    total_sent_count = 0
    post_count = 0
    current_id = 0
    try:
        cnx = mysql.connector.connect(host='localhost',
                                      user='******',
                                      password=pw,
                                      db='stackoverflow')
        cursor = cnx.cursor()
        query = "SELECT Id, Body FROM {} WHERE Score >= 0 AND Id >= {} AND Id < {}".format(
            table_name, start, start + batch)
        cursor.execute(query)
        for current_id, row in cursor.fetchall():
            post_count += 1
            word_list = get_words(row)
            total_sent_count += len(word_list)

            for words in word_list:
                if words == []:
                    continue
                rtn = check_tech_pairs(words)
                if rtn is not None:
                    compa_sent_count += 1
                    data_file = open(
                        os.path.join(os.pardir, "out", table_name,
                                     "{}.txt".format(os.getpid())), "a")
                    data_file.write("{}\n".format(current_id))
                    data_file.write("{}\n".format(rtn[1]))
                    data_file.write("{}\n".format(rtn[0]))
                    data_file.write("\n")
                    data_file.close()
    finally:
        print("Proc {}: {}/{} from {} to {} ({} posts)".format(
            os.getpid(), compa_sent_count, total_sent_count, start, current_id,
            post_count))
Example #5
def main(start):
    compa_sent_count = 0
    total_sent_count = 0
    post_count = 0
    current_id = 0
    old_pattern_matcher = OldPatternMatcher()

    try:
        pre_words = []
        post_words = []
        conn = psycopg2.connect(
            'dbname=stackoverflow port=5432 host=localhost')
        cursor = conn.cursor()
        query = "SELECT Id, Body FROM {} WHERE Score >= 0 AND posttypeid != 1 AND Id >= {} AND Id < {}".format(
            table_name, start, start + batch)
        # query = "SELECT Id, Body FROM Posts WHERE Id = 3979"
        # query = "SELECT Id, Body FROM Posts WHERE Id = 115838 "
        cursor.execute(query)

        for current_id, row in cursor.fetchall():

            post_count += 1
            word_list = get_words(row)
            total_sent_count += len(word_list)

            for idx in range(0, len(word_list), 2):
                if idx == 0:
                    pre_words = []
                else:
                    pre_words = word_list[idx - 1]
                words = word_list[idx]
                if idx != len(word_list) - 1:
                    post_words = word_list[idx + 1]
                else:
                    post_words = []
                if words == []:
                    continue

                rtns = check_tech_pairs(pre_words, words, post_words, words,
                                        post_words, current_id)
                for rtn in rtns:
                    if rtn is not None:
                        if len(rtn) == 4:
                            compa_sent_count += 1
                            data_file = open(
                                os.path.join(
                                    os.pardir, "outnew",
                                    "oldPattern_{}_v4".format(table_name),
                                    "changed_{}.txt".format(os.getpid())), "a")
                            data_file.write("{}\n".format(current_id))
                            data_file.write("{}\n".format(rtn[1]))
                            data_file.write("Changed: \n{}\n".format(rtn[0]))
                            if rtn[2] == "word":
                                data_file.write("Origin: \n{}\n".format(
                                    ' '.join(words)))
                            else:
                                data_file.write("Origin: \n{}\n".format(
                                    ' '.join(post_words)))
                            data_file.write("\n\n")
                            data_file.close()
                            old_pattern_matcher.old_match_pattern(
                                rtn[0], current_id, rtn[1], table_name,
                                rtn[-1])
                        else:
                            compa_sent_count += 1
                            data_file = open(
                                os.path.join(
                                    os.pardir, "outnew",
                                    "{}_v4".format(table_name),
                                    "leased_{}.txt".format(os.getpid())), "a")
                            data_file.write("{}\n".format(current_id))
                            data_file.write("{}\n".format(rtn[3]))
                            data_file.write("{}\n".format(rtn[0]))
                            data_file.write("{}\n".format(rtn[1]))
                            data_file.write("{}\n".format(rtn[2]))
                            data_file.write("\n")
                            data_file.close()

    finally:
        print("Proc {}: {}/{} from {} to {} ({} posts)".format(
            os.getpid(), compa_sent_count, total_sent_count, start, current_id,
            post_count))
Example #6
def main(start):
    compa_sent_count = 0
    total_sent_count = 0
    post_count = 0
    current_id = 0
    pattern_matcher = PatternMatcher()
    try:
        pre_words = []
        post_words = []
        conn = psycopg2.connect(
            'dbname=stackoverflow port=5433 host=localhost')
        cursor = conn.cursor()
        # query = "SELECT Id, Body FROM {} WHERE Score > 0 AND posttypeid != 1 AND Id >= {} AND Id < {}".format(table_name, start, start+batch)
        query = "SELECT Id, Body FROM Posts WHERE Id = 145200 or Id = 6713"
        cursor.execute(query)
        for current_id, row in cursor.fetchall():
            post_count += 1
            word_list = get_words(row)
            total_sent_count += len(word_list)

            for idx in range(0, len(word_list), 2):
                if idx == 0:
                    pre_words = []
                else:
                    pre_words = word_list[idx - 1]
                words = word_list[idx]
                if idx != len(word_list) - 1:
                    post_words = word_list[idx + 1]
                else:
                    post_words = []
                if words == []:
                    continue

                rtn = check_tech_pairs(pre_words, words, post_words)
                if rtn is not None:
                    if len(rtn) == 2:
                        compa_sent_count += 1
                        data_file = open(
                            os.path.join(os.pardir, "outnew",
                                         "{}_v4".format(table_name),
                                         "{}.txt".format(os.getpid())), "a")
                        data_file.write("{}\n".format(current_id))
                        data_file.write("{}\n".format(rtn[1]))
                        data_file.write("{}\n".format(rtn[0]))
                        data_file.write("\n")
                        data_file.close()
                    else:
                        compa_sent_count += 1
                        data_file = open(
                            os.path.join(os.pardir, "outnew",
                                         "{}_v4".format(table_name),
                                         "{}.txt".format(os.getpid())), "a")
                        data_file.write("{}\n".format(current_id))
                        data_file.write("{}\n".format(rtn[3]))
                        data_file.write("{}\n".format(rtn[0]))
                        data_file.write("{}\n".format(rtn[1]))
                        data_file.write("{}\n".format(rtn[2]))
                        data_file.write("\n")
                        data_file.close()
                        pairs = rtn[3].split()
                        known_pairs = []
                        for x, y in grouped(pairs, 2):
                            if [x, y] not in known_pairs and [y, x] not in known_pairs:
                                pattern_matcher.match_pattern(
                                    rtn[0], rtn[1], rtn[2], current_id,
                                    "{} {}".format(x, y))
                            known_pairs.append([x, y])

    finally:
        print("Proc {}: {}/{} from {} to {} ({} posts)".format(
            os.getpid(), compa_sent_count, total_sent_count, start, current_id,
            post_count))
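
The `grouped` helper used in Example #6 is not part of the snippet. A minimal
sketch, assuming it yields consecutive non-overlapping n-tuples from a flat
sequence (so a pair list like "a b c d" becomes (a, b), (c, d)):

def grouped(iterable, n):
    # Reuse one iterator n times so each emitted tuple consumes n items.
    return zip(*[iter(iterable)] * n)
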
Example #7
nlp = spacy.load('en')
matcher = Matcher(nlp.vocab)
add_patterns(matcher)

# `connection`, `post_count`, `total_sent_count`, `cin` and `cv` are set up
# earlier in the original script; only the scan loop is shown here.
start_id = start = 20000
end = 50000
try:
    with connection.cursor() as cursor:
        while (start < end):
            sql = "SELECT Id, Body FROM Posts WHERE Score >= 0 AND Id >= {} AND Id < {}".format(
                start, start + 100)
            cursor.execute(sql)
            for current_id, row in cursor.fetchall():
                post_count += 1
                # row = cursor.fetchone()
                word_list = get_words(row)
                total_sent_count += len(word_list)

                for sent in word_list:
                    tagged_words = nltk.pos_tag(sent)
                    # print tagged_words
                    tag_list = []
                    for (word, tag) in tagged_words:
                        if tag == "IN" and word in cin:
                            tag_list.append("CIN")
                        elif tag[:2] == "VB" and word in cv:
                            tag_list.append("CV")
                        else:
                            cursor.execute(
                                "SELECT * FROM Tags WHERE TagName = \'{}\'".
                                format(word))
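
The `add_patterns` call in Examples #2 and #7 is not included in these
snippets. A hypothetical sketch, purely to illustrate the spaCy 2.x Matcher
API being used: the matcher runs over the space-joined tag sequence (see
Example #2), so patterns are written against tags such as CV and CIN rather
than against the words themselves.

def add_patterns(matcher):
    # Hypothetical pattern: a verb tag (CV) followed by a preposition tag
    # (CIN). The real patterns are not part of the source snippets.
    matcher.add("CV_CIN", None, [{"ORTH": "CV"}, {"ORTH": "CIN"}])
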
Example #8
from prepros import get_words
# import mysql.connector
from tech_sentences import check_tech_pairs_v2
import os
import pickle

# cnx = mysql.connector
#
# cnx = mysql.connector.connect(host='localhost',
#                               user='******',
#                               password='******',
#                               db='stackoverflow')
# cursor = cnx.cursor()
# query = "SELECT Body FROM Posts WHERE Id=120140"
# cursor.execute(query)

words = get_words(
    "Basically he was saying that ASP.NET MVC is not for large-scale enterprise applications"
)
print(words[0])
(line, techs) = check_tech_pairs_v2(words[0])
print(line)