Example #1
def get_spells(request):
    conn = db_connection(request)
    cursor = conn.cursor()
    cursor.execute("SELECT * FROM spells;")
    result = cursor.fetchall()
    conn.close()
    return result
def do(originfile):
    f = open(originfile, "r")
    for line in f:
        liketextgood = ''
        liketextbad = ''
        kw = line[:-1]  # strip the trailing newline (every line, including the last, must end with \n)
        fs = open(
            "subkeywords_booking/subkeywords_booking_cleaned/" + kw +
            ".txt", "r")
        l = []
        # Build OR-ed LIKE clauses over the good/bad review columns for each subkeyword.
        for subkw in fs:
            l.append(subkw)
            if '\'' in subkw:
                subkw = subkw.replace('\'', "''")  # escape single quotes for SQL
            subkw = subkw[:-1]
            if subkw != '':
                liketextgood += ' or good LIKE \'%' + subkw + '%\''
                liketextbad += ' or bad LIKE \'%' + subkw + '%\''
        fs.close()
        liketextgood = liketextgood[4:]  # drop the leading ' or '
        liketextgood += ';'
        liketextbad = liketextbad[4:]
        liketextbad += ';'
        db = db_connection()
        queryexecutor = db_operator(db)
        db.connect()
        query_good = 'CREATE VIEW ' + kw + '_good_view AS SELECT * FROM masterthesis.reviews where ' + liketextgood
        query_bad = 'CREATE VIEW ' + kw + '_bad_view AS SELECT * FROM masterthesis.reviews where ' + liketextbad
        print(query_bad)
        print(query_good)
        viewgood = queryexecutor.execute(query=query_good)
        viewbad = queryexecutor.execute(query=query_bad)
        db.disconnect()
    f.close()
Example #3
def fetchall_cursor(fetchall_query):
    cursor = db_connection()
    cursor.execute(fetchall_query)
    row = cursor.fetchall()
    resp = jsonify(row)
    resp.status_code = 200
    return resp
Example #4
def mark_spell_complete(request, castTime):
    conn = db_connection(request)
    cursor = conn.cursor()
    result = _mark_spell_complete(cursor, castTime)
    conn.commit()
    conn.close()
    return result
def option_two():

    documents = db_connection()

    menu.movies_registred()

    for dic in documents.find({}):
        movie = MovieFromDb(dic["name"], dic["theater"], dic["room"], dic["time"])
        print(movie)
    
    exit()
def option_three():

    documents = db_connection()

    title = validation.name_input(title="")

    find_document = documents.find_one({"name":f"{title}"},projection={"_id": False})

    movie = MovieFromDb(find_document["name"], find_document["theater"], find_document["room"], find_document["time"])

    menu.movies_registred()
    print(movie)

    exit()
Example #7
def getRawCorpus(csv_file,
                 id_and_country=False,
                 additionaldetails=False,
                 limit=-1,
                 offset=0):
    raw_corpus = []
    reader = csv.reader(csv_file, delimiter='|', quotechar='"')
    i = 0
    if additionaldetails:
        db = db_connection()
        queryexecutor = db_operator(db)
        db.connect()
        for row in reader:
            if i % 50000 == 0 and i != 0:
                print('reading sentence ' + str(i))
            i += 1
            if (i > offset and i <= offset + limit) or limit == -1:
                id = row[0]
                query = 'SELECT HotelNumber, FamilyType FROM masterthesis.reviews WHERE ReviewID=' + id
                det = queryexecutor.execute(query=query)
                if len(det) <= 0:
                    det = [('no_hotel_number', 'no_family_type')]  # single placeholder row
                det = [det[0][0], det[0][1]]
                query = 'SELECT CountryID FROM masterthesis.hotels WHERE HotelNumber=' + str(
                    det[0])
                hot = queryexecutor.execute(query=query)
                if len(hot) <= 0:
                    hot = [('no_country', )]
                hot = [hot[0][0]]
                det = det + hot
                raw_corpus.append(row + det)
            if i > offset + limit and limit > 0:
                break
        db.disconnect()
        csv_file.close()
        return raw_corpus
    if id_and_country:
        for row in reader:
            if i % 50000 == 0 and i != 0:
                print('reading sentence ' + str(i))
            i += 1
            raw_corpus.append(row)
    else:
        for row in reader:
            if i % 50000 == 0 and i != 0:
                print('reading sentence ' + str(i))
            i += 1
            raw_corpus.append(row[2])
    csv_file.close()
    return raw_corpus
Example #8
def main(scrapestreetlinks=None,
         scrapepdflinks=None,
         download=None,
         parse=None,
         save=None):
    links = []
    if scrapestreetlinks:
        links = get_bin_links()
    if scrapepdflinks:
        if not len(links):
            links = pd.read_csv(LINKSCSV)
        links['pdf_url'] = links['uri'].apply(get_collection_pdf_link)
        links.to_csv("binlinks.csv")
    if download:
        if not len(links):
            links = pd.read_csv(LINKSCSV)
        get_collection_pdfs(links)
    if parse:
        if not len(links):
            links = pd.read_csv(LINKSCSV)
        links['filename'] = links['pdf_url'].str.split("/").str[-1]
        times = parse_pdfs()
        bintimes = pd.merge(links,
                            times,
                            left_on='filename',
                            right_on='filename',
                            how='right').drop(columns=['Unnamed: 0'])
        bintimes.to_csv('bintimes.csv')

    if save:
        bintimes = pd.read_csv('bintimes.csv')
        bindicts = bintimes.to_dict(orient='records')
        for idx, bindict in enumerate(bindicts):
            datestr = bindict['date']
            if not isinstance(datestr, str):
                datestr = datetime.datetime.strftime(bindict['date'],
                                                     '%Y-%m-%d')
            bindict['_id'] = bindict['street'] + '_' + datestr
            bindict['_id'] = bindict['_id'].replace(' ', '_').lower()
            bindict['city'] = 'Edinburgh'.lower()
            bindicts[idx] = bindict
        print('upserting {} records'.format(len(bindicts)))
        bulk_upsert(db_connection(), 'bindays', 'days', bindicts)
Example #9
def create_spell(request, params):
    conn = db_connection(request)
    cursor = conn.cursor()
    spellTime = int(time.time())
    # We use spellTime as a primary key. So if we should happen to get two spells
    # at the same second, pretend like the second came a second later.
    while _get_spell_by_time(cursor, spellTime):
        spellTime += 1
    try:
        assert(_authenticate_user(params['user_name'], params['spirit_animal'],
                cursor))
        assert(isinstance(params['name'], basestring))
        assert(params['name'] != '')
        assert(params['setup'] or params['loop'])
        for component in ['setup', 'loop']:
            if params[component]:
                for frame in params[component]:
                    try:
                        assert(validate_frame(frame))
                    except Exception:
                        log.debug(frame)
                        raise AssertionError()
    except (AssertionError, IOError):
        return False
    setup = json.dumps(params['setup']) if params['setup'] else ''
    loop = json.dumps(params['loop']) if params['loop'] else ''
    cursor.execute('INSERT INTO spells VALUES (?,?,?,?,?,?,?)', (
        params['user_name'],
        params['name'], 
        3,
        spellTime,
        setup,
        loop,
        0
    ))
    conn.commit()
    newSpell = _get_spell_by_time(cursor, spellTime)
    conn.close()
    return newSpell
Example #10
# db_connection, search_streets and get_days are assumed to come from the
# project's own modules; output_json may be flask_restful.representations.json
# or a project helper (not shown in this snippet).
from datetime import date, datetime
from flask import Flask, request
from flask_cors import CORS
from flask_restful import Api, Resource, reqparse

app = Flask(__name__)
app.config['CORS_HEADERS'] = 'Content-Type'
cors = CORS(app, resources={r"*": {"origins": "*"}})
api = Api(app)

parser = reqparse.RequestParser()
parser.add_argument('term')

HEADERS = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Headers': '*'
}

conn = db_connection()


class SearchStreets(Resource):
    def get(self):
        search_term = request.args.get('q')
        streets = search_streets(conn, str(search_term))
        return output_json(streets, 200)


class Street(Resource):
    def get(self, city, street):
        bindays = get_days(conn, city, street)
        return output_json(bindays, 200)
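
# The resources above are never registered with the Api instance in this
# snippet. A minimal sketch of the missing wiring; the URL routes below are
# assumptions, not taken from the original code.
api.add_resource(SearchStreets, '/streets/search')
api.add_resource(Street, '/days/<string:city>/<string:street>')

if __name__ == '__main__':
    app.run()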

Example #11
def cursor_request(*args):
    cursor = db_connection()
    cursor.execute(*args)
    mysql.connection.commit()
    cursor.close()
from flask import Flask, jsonify, render_template
from db import db_connection

app = Flask(__name__)

db = db_connection()
connection = db.connect_to_db(app)


@app.route('/songs')
def get_songs():

    songs = db.get_songs(connection)
    print(songs)

    return render_template("songs.html", songs=songs)


@app.route('/')
def home():

    return render_template("home.html")


if __name__ == "__main__":
    print('Flask app is running...')
    app.run('0.0.0.0', '8080')
    print('...Flask app has stopped.')
def analyze(originfile, all=False):
    keywords = helper.getKeywords(originfile)
    os.chdir('./resources/stanford-corenlp-full-2018-10-05')
    os.system('kill $(lsof -t -i:9000)')
    cmd = 'java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "tokenize,ssplit,pos,lemma,parse,sentiment" -port 9000 -timeout 10000000000000 &'
    time.sleep(4)
    print("starting nlp service")
    with open(os.devnull, "w") as f:
        subprocess.call(cmd, shell=True, stderr=f, stdout=f)
    time.sleep(4)
    print("nlp service started")
    os.chdir('../../')
    nlp_wrapper = StanfordCoreNLP('http://localhost:9000')
    print("Number of processors: ", mp.cpu_count())
    if all:
        print("all")
        '''if not os.path.isfile('/resources/all_test.csv'):
            print("test file created")
            open('./resources/all_test.csv', 'w').close()'''
        conn = db.db_connection()
        dbo = db.db_operator(conn)
        spell = SpellChecker()
        counter = Value('i', 1)
        corpus_tok_all=[]
        '''for i in range(1790):
            print('i=' +str(i))
            print("limit= 10000")
            print("offset= "+str(10000*i))
            conn.connect()
            query = 'SELECT reviews.ReviewID, reviews.Country as \'Tourist_Country\', ' \
                    'hotels.CountryID as \'Hotel Country\', Good, reviews.Bad ' \
                    'FROM masterthesis.reviews, masterthesis.hotels ' \
                    'where hotels.HotelNumber=reviews.HotelNumber limit 10000 offset '+str(10000*i)+';'
            results = [list(x) for x in dbo.execute(query)];
            conn.disconnect()
            print("got results from sql")
            print("starting analysis")
            print("tot number rows= " + str(len(results)))
            try:
                print('analyzing 10000 rows '+time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2,initargs=(counter, spell, nlp_wrapper,), )
                corpus_tok = pool.map_async(thread_function_row_only_all, [doc for doc in results]).get(timeout=1200)
                pool.close()
                pool.terminate()
                pool.join()
                print('got corpus_tok for 10000 rows '+time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            except TimeoutError:
                print("timeout error")
                pool.close()
                pool.terminate()
                pool.join()
                corpus_tok=[]
                for doc in results:
                    try:
                        pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2, initargs=(counter,spell,nlp_wrapper,), )
                        c=pool.map_async(thread_function_row_only_all, [doc]).get(timeout=60)
                        #print('pool close')
                        pool.close()
                        pool.terminate()
                        #print('pool join')
                        pool.join()

                    except TimeoutError:
                        print(str(doc)+" caused Exception")
                        pool.close()
                        pool.terminate()
                        #print('pool join')
                        pool.join()
                        c=[None]
                    corpus_tok.append(c[0])
            print("beginning removal of sents with contrast")
            corpus_tok = [r for r in corpus_tok if r != None]
            print('len corpus_tok_reduced= '+str(len(corpus_tok)))
            corpus_tok_all+=corpus_tok
            print('len corpus_tok_all= ' + str(len(corpus_tok_all)))
            if i%100==0 and i!=0:
                with open('./resources/all_test.csv', mode='a') as file:
                    writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                    for c in corpus_tok_all:
                        writer.writerow(c)
                file.close()
                corpus_tok_all=[]
        '''


        '''
        corpus_tok_all=[]
        i=0
        kk=set()
        with open('./resources/all_test.csv', mode='r') as file:
            reader = csv.reader(file, delimiter='|', quotechar='"')
            for row in reader:
                i+=1
                if i%100000==0:
                    print(i)
                #if i%10000==0:break
                ar=((row[0].replace('[','')).replace(']','')).split(',')
                if ar[1][-1]!="'":#France, Metro.
                    ar[1]=ar[1]+','+ar[2]
                    for j in range(2,len(ar)-1):
                        ar[j]=ar[j+1]
                    del ar[len(ar)-1]
                ar[1]=ar[1][2:-1]
                ar[2] = (ar[2].replace("'", '')).replace(' ', '')
                rev=''.join(ar[3:])
                revlist= ar[:3]
                revlist.append(rev)
                tokens = ((((row[1].replace(']', '')).replace('[','')).replace("'",'')).replace(" ",'')).split(',')
                r=(revlist,tokens)
                k=ar[0]
                if k not in kk:
                    kk.add(k)
                    corpus_tok_all.append(r)
        file.close()
        corpus_tok=corpus_tok_all
        corpustokonly = [r[1] for r in corpus_tok]
        print("doing bigrams")
        # Add bigrams and trigrams to docs (only ones that appear 10 times or more).
        bigram = Phrases(corpustokonly, min_count=0.001 * len(corpus_tok))
        lenc=len(corpus_tok)
        print("corpus_tok len = "+str(lenc))
        for idx in range(lenc):
            if idx%100000==0:
                print(idx)
                print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            for token in bigram[corpustokonly[idx]]:
                if '_' in token:
                    # Token is a bigram, add to document.
                    corpus_tok[idx][1].append(token)
        with open('./resources/corpus_tok_all.csv', mode='w') as file:
                writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                writer.writerows(corpus_tok)
        file.close()
        print("corpus_tok written")
        from gensim.corpora import Dictionary
        print("writing frequence file")
        '''

        



        '''all_set=set()
        for emotion in ['Good', 'Bad']:
            print("begin " + emotion)
            for keyword in list(keywords.keys()):
                if not (keyword == 'cleaning' or keyword=='pet'):
                    start_time = time.time()
                    print(keyword + ' ---- ' + time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                    raw_corpus = helper.getRawCorpus(
                        csv_file=open('resources/csvs/' + keyword + '_' + emotion.lower() + '.csv', mode='r',
                                      encoding="utf8", newline='\n'), additionaldetails=True)
                    # corpus = helper.getCorpusTextFromRaw(raw_corpus)
                    spell = SpellChecker()
                    counter = Value('i', 1)
                    print("starting analysis")
                    pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2,
                                   initargs=(counter, spell, nlp_wrapper,), )
                    corpus_tok = pool.map_async(thread_function_row_only, [doc for doc in raw_corpus]).get()
                    print('pool close')
                    pool.close()
                    print('pool join')
                    pool.join()
                    print("beginning removal of sents with contrast")
                    corpus_tok = [r for r in corpus_tok if r != None]
                    ###############################################################################
                    # We find bigrams in the documents. Bigrams are sets of two adjacent words.
                    # Using bigrams we can get phrases like "machine_learning" in our output
                    # (spaces are replaced with underscores); without bigrams we would only get
                    # "machine" and "learning".
                    #
                    # Note that in the code below, we find bigrams and then add them to the
                    # original data, because we would like to keep the words "machine" and
                    # "learning" as well as the bigram "machine_learning".
                    #
                    # .. Important::
                    #     Computing n-grams of large dataset can be very computationally
                    #     and memory intensive.
                    #
                    print('len all_set_tok before= ' + str(len(all_set)))
                    print('len corpus_tok= ' + str(len(corpus_tok)))
                    print('len corpus_tok+all_set_tok= ' + str(len(corpus_tok) + len(all_set)))
                    for sen in corpus_tok:
                        all_set.add((tuple(sen[0]),tuple(sen[1])))
                    print('len all_set_tok after= ' + str(len(all_set)))
                    print('------------------------------------------------------')
                    print(str(time.time() - start_time) + ' seconds to compute ' + keyword + ' ' + emotion)
        # Compute bigrams.
        if len(all_set) > 0:
            corpus_tok=[(list(x[0]),list(x[1])) for x in all_set]
            corpustokonly = [r[1] for r in corpus_tok]
            print("doing bigrams")
            # Add bigrams and trigrams to docs (only ones that appear 10 times or more).
            bigram = Phrases(corpustokonly, min_count=0.001 * len(corpus_tok))
            for idx in range(len(corpus_tok)):
                for token in bigram[corpustokonly[idx]]:
                    if '_' in token:
                        # Token is a bigram, add to document.
                        corpus_tok[idx][1].append(token)
            from gensim.corpora import Dictionary
            print("writing frequence file")

            # Create a dictionary representation of the documents.
            dictionary = Dictionary(corpustokonly)

            alltok = []
            freq = []
            for doc in corpustokonly:
                for tok in doc:
                    alltok.append(tok)
            lencorpus = len(corpus_tok)
            print("len dictionary = " + str(len(dictionary.keys())))
            i = 0
            for t in dictionary:
                i += 1
                if i % 1000 == 0:
                    print("analyzing token " + str(i))
                freqsent = 0
                for doc in corpustokonly:
                    if dictionary.get(t) in doc:
                        freqsent += 1
                freq.append((t, dictionary.get(t), alltok.count(dictionary.get(t)),
                             alltok.count(dictionary.get(t)) / len(alltok), freqsent, freqsent / lencorpus))
            freq.sort(key=lambda tup: tup[5], reverse=True)
            for i in range(len(freq)):
                freq[i] = tuple(list(freq[i]) + [i])
            if not os.path.exists('resources/bow/allfreq/stanford/'):
                os.makedirs('resources/bow/allfreq/stanford/')
            with open('resources/bow/allfreq/stanford/all.txt',
                      'w') as f:
                for item in freq:
                    f.write(str(item) + '\n')
                f.close()

            print("writing bow file")
            top_tokens = [f[1] for f in freq[:500]]
            lentoptok = len(top_tokens)
            corpus_bow = {}
            toplen = 0
            for i in range(len(corpus_tok)):
                corpus_bow[i] = [0] * lentoptok
                if len(corpus_tok[i][0] + corpus_tok[i][1]) > toplen:
                    toplen = len(corpus_tok[i][0] + corpus_tok[i][1])
                for tok in corpus_tok[i][1]:
                    if tok in top_tokens:
                        corpus_bow[i][top_tokens.index(tok)] = 1

            with open('resources/bow/all.csv', mode='w') as file:
                writer = csv.writer(file, delimiter='|', quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
                writer.writerow([''] * toplen + top_tokens)
                for i in corpus_bow.keys():
                    writer.writerow(corpus_tok[i][0] + corpus_tok[i][1] + [''] * (
                            toplen - len(corpus_tok[i][0] + corpus_tok[i][1])) + corpus_bow[i])
            file.close()
        '''
        


        # Create a dictionary representation of the documents.
        '''dictionary = Dictionary(corpustokonly)

        alltok = []
        freq = []
        for doc in corpustokonly:
            for tok in doc:
                alltok.append(tok)
        lencorpus = len(corpus_tok)
        print("len dictionary = " + str(len(dictionary.keys())))
        time.sleep(100000)
        counter = Value('i', 0)
        pool = mp.Pool(initializer=init_globals_token_analyzer, processes=mp.cpu_count(), initargs=(counter,corpustokonly,dictionary,lencorpus,alltok), )
        print("pool initialized")
        corpustokonly=None
        alltok=None
        del corpustokonly, alltok
        freq = pool.map_async(thread_function_row_only_token_analyzer, [t for t in dictionary]).get()
        pool.close()
        pool.terminate()
        pool.join()
        dictionary=None
        del dictionary
        global ctonly, dic, alltoks
        ctonly=None
        dic=None
        alltoks=None
        del ctonly,dic,alltoks
        print("frequence list len= "+str(len(freq)))
        print("frequence list created")
        freq.sort(key=lambda tup: tup[5], reverse=True)
        print("frequence list sorted")
        for i in range(len(freq)):
            if i%10000==0:
                print(i)
                print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            freq[i] = tuple(list(freq[i]) + [i])
        print("frequence list modified")
        if not os.path.exists('resources/bow/allfreq/stanford/'):
            os.makedirs('resources/bow/allfreq/stanford/')
        i=0
        '''
        '''with open('resources/bow/allfreq/stanford/all.txt', 'w') as f:
            for item in freq:
                i+=1
                if i%10000==0:
                    print(i)
                    print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                f.write(str(item) + '\n')
            f.close()'''

        corpus_tok=[]
        i=0
        with open('./resources/corpus_tok_all.csv', mode='r') as file:
            reader = csv.reader(file, delimiter='|', quotechar='"')
            for row in reader:
                i+=1
                if i%100000==0:
                    print(i)
                corpus_tok.append(row)
        file.close()
        print("len corpus_tok= "+str(len(corpus_tok)))
        freq=[]
        i=0
        with open('./resources/bow/allfreq/stanford/all.txt', mode='r') as file:
            reader = csv.reader(file, delimiter='|', quotechar='"')
            for row in reader:
                i+=1
                if i==501:break
                freq.append(row)
        file.close()
        for i in range(len(freq)):
            freq[i]=freq[i][0]
            freq[i]=freq[i].replace("'",'')
            freq[i]=freq[i].replace('"','')
            freq[i]=freq[i].replace('(','')
            freq[i]=freq[i].replace(')','')
            freq[i]=freq[i].replace(' ','')
            freq[i]=freq[i].split(',')
            freq[i]=tuple(freq[i])
        for i in range(len(corpus_tok)):
            if i%100000==0:
                print(i)
            corpus_tok[i][0]=corpus_tok[i][0].replace('[','')
            corpus_tok[i][0]=corpus_tok[i][0].replace(']','')
            det=(corpus_tok[i][0].split(','))
            if 'São Tomé' in det[1]:#São Tomé and PrÃ\\\\xadncipe
                det[1]='  '+'São Tomé and PrÃ\xadncipe'+' '
            if det[1][-1]!="'":#France, Metro
                if 'Ivoire' in det[1]:#Cote d'Ivoire
                    det[1]=det[1].replace('\\','')
                    det[2]=det[2][1:]
                else:
                    det[1]=det[1]+','+det[2]
                    for j in range(2,len(det)-1):
                        det[j]=det[j+1]
                    del det[len(det)-1]
            det=det[:3]
            desc=(corpus_tok[i][0].split(','))[-1]
            det[0]=det[0][1:-1]
            det[1]=det[1][2:-1]
            det[2]=det[2][2:-1]
            desc=desc[3:-1]
            det.append(desc)
            corpus_tok[i][0]=det
            corpus_tok[i][1]=corpus_tok[i][1].replace("'",'')
            corpus_tok[i][1]=corpus_tok[i][1].replace(' ','')
            corpus_tok[i][1]=corpus_tok[i][1].replace('[','')
            corpus_tok[i][1]=corpus_tok[i][1].replace(']','')
            corpus_tok[i][1]=corpus_tok[i][1].split(',')
        print("writing bow file")
        top_tokens = [f[1] for f in freq[:400]]
        lentoptok = len(top_tokens)
        corpus_bow = {}
        toplen = 0
        print("corpus_tok_len= "+str(len(corpus_tok)))
        for i in range(len(corpus_tok)):
            corpus_bow[i] = [0] * lentoptok
            if i%100000==0:
                print(i)
                print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            if len(corpus_tok[i][0] + corpus_tok[i][1]) > toplen:
                toplen = len(corpus_tok[i][0] + corpus_tok[i][1])
            for tok in corpus_tok[i][1]:
                if tok in top_tokens:
                    corpus_bow[i][top_tokens.index(tok)] = 1
        print("len corpus_bow keys= "+str(len(corpus_bow.keys())))
        print("got corpus_bow")
        j=0
        print("corpus_bow_len "+str(len(corpus_bow)))
        with open('resources/bow/all.csv', mode='w') as file:
            writer = csv.writer(file, delimiter='|', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
            writer.writerow([''] * toplen + top_tokens)
            for i in corpus_bow.keys():
                j+=1
                if j%100000==0:
                    print(j)
                    print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                writer.writerow(
                    corpus_tok[i][0] + corpus_tok[i][1] + [''] * (toplen - len(corpus_tok[i][0] + corpus_tok[i][1])) +
                    corpus_bow[i])
        file.close()
        print("over")
    else:
        print("not all")
        for emotion in ['Good','Bad']:
            print("begin " + emotion)
            for keyword in list(keywords.keys()):
                if emotion=='Good' and keyword=='cleaning':#cleaning good
                    start_time = time.time()
                    print(keyword+' ---- '+time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                    spell = SpellChecker()
                    counter = Value('i', 1)
                    corpus_tok_all=[]
                    #if not os.path.isfile('/resources/cleaning_test.csv'):
                        #open('./resources/cleaning_test.csv', 'w').close()
                    for i in range(400):#400
                        print(str(i))
                        offset=i*1000
                        limit=1000
                        print("starting reading")
                        print("limit="+str(limit))
                        print("offset="+str(offset))
                        raw_corpus = helper.getRawCorpus(
                            csv_file=open('resources/csvs/' + keyword + '_' + emotion.lower() + '.csv', mode='r',
                                          encoding="utf8", newline='\n'), additionaldetails=True, limit=limit, offset=offset)

                        #corpus = helper.getCorpusTextFromRaw(raw_corpus)
                        #raw_corpus_half_one = raw_corpus[:int(len(raw_corpus) / 2)]
                        #raw_corpus_half_two=raw_corpus[int(len(raw_corpus)/2):]
                        print("starting analysis")
                        pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2, initargs=(counter,spell,nlp_wrapper,), )
                        try:
                            corpus_tok = pool.map_async(thread_function_row_only, [doc for doc in raw_corpus]).get(timeout=30)
                            pool.close()
                            pool.join()
                        except TimeoutError:
                            print("timeout error")
                            print('pool close')
                            pool.close()
                            print('pool terminate')
                            pool.terminate()
                            print('pool join')
                            pool.join()
                            corpus_tok=[]
                            for doc in raw_corpus:
                                try:
                                    pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2, initargs=(counter,spell,nlp_wrapper,), )
                                    c=pool.map_async(thread_function_row_only, [doc]).get(timeout=30)
                                    #print('pool close')
                                    pool.close()
                                    #print('pool join')
                                    pool.join()
                                    '''thread = threading.Thread(target = thread_function_row_only, args = (doc))
                                    thread.start()
                                    thread.join()
                                    c=que.get()'''
                                except TimeoutError:
                                    print(str(doc)+" caused Exception")
                                    c=[None]
                                corpus_tok.append(c[0])
                        corpus_tok_reduced=[r for r in corpus_tok if r != None]
                        print("len corpus_tok: " + str(len(corpus_tok)))
                        print("len corpus_tok_reduced: " + str(len(corpus_tok_reduced)))
                        '''with open('./resources/cleaning_test.csv', mode='a') as file:
                            writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                            for c in corpus_tok_reduced:
                                writer.writerow(c)
                        file.close()'''
                        corpus_tok_all+=corpus_tok_reduced
                        print("len corpus_tok_all: " + str(len(corpus_tok_all)))
                    '''
                    corpus_tok=[]
                    s=0
                    for doc in corpus:
                        newdoc=False
                        doc = doc.lower()
                        s += 1
                        if s % 10000 == 0:
                            print(str(s))
                        for con in constr_conjs:
                            if con in doc:
                                newdoc=True
                                break
                        if not newdoc:
                            toks = [spell.correction(tok['lemma']) for tok in
                                    nlp_wrapper.annotate(doc,
                                                         properties={'annotators': 'lemma, pos', 'outputFormat': 'json', })[
                                        'sentences'][0]['tokens']
                                    if tok['pos'] in ['NNS', 'NN'] and len(tok['lemma']) > 1]
                            toapp = []
                            for i in range(len(toks)):
                                if '/' in toks[i]:
                                    for tok in toks[i].split('/'):
                                        toapp.append(tok)
                            for tok in toapp:
                                toks.append(tok)
                            toapp = []
                            for i in range(len(toks)):
                                if '-' in toks[i]:
                                    for tok in toks[i].split('-'):
                                        toapp.append(tok)
                            for tok in toapp:
                                toks.append(tok)
                            corpus_tok.append(toks)'''
                    #print("beginning removal of sents with contrast")
                    corpus_tok=corpus_tok_all
                    print("len corpus_tok: " + str(len(corpus_tok)))
                    ###############################################################################
                    # We find bigrams in the documents. Bigrams are sets of two adjacent words.
                    # Using bigrams we can get phrases like "machine_learning" in our output
                    # (spaces are replaced with underscores); without bigrams we would only get
                    # "machine" and "learning".
                    #
                    # Note that in the code below, we find bigrams and then add them to the
                    # original data, because we would like to keep the words "machine" and
                    # "learning" as well as the bigram "machine_learning".
                    #
                    # .. Important::
                    #     Computing n-grams of large dataset can be very computationally
                    #     and memory intensive.
                    #
                    # Compute bigrams.
                    if len(corpus_tok)>0:
                        corpustokonly=[r[1] for r in corpus_tok]
                        print("doing bigrams")
                        # Add bigrams and trigrams to docs (only ones that appear 10 times or more).
                        bigram = Phrases(corpustokonly, min_count=0.001 * len(corpus_tok))
                        for idx in range(len(corpus_tok)):
                            for token in bigram[corpustokonly[idx]]:
                                if '_' in token:
                                    # Token is a bigram, add to document.
                                    corpus_tok[idx][1].append(token)
                        from gensim.corpora import Dictionary
                        print("writing frequence file")

                        # Create a dictionary representation of the documents.
                        dictionary = Dictionary(corpustokonly)

                        alltok = []
                        freq=[]
                        for doc in corpustokonly:
                            for tok in doc:
                                alltok.append(tok)
                        lencorpus=len(corpus_tok)
                        print("len dictionary = "+str(len(dictionary.keys())))
                        i=0
                        for t in dictionary:
                            i+=1
                            if i%1000==0:
                                print("analyzing token "+str(i))
                            freqsent = 0
                            for doc in corpustokonly:
                                if dictionary.get(t) in doc:
                                    freqsent+=1
                            freq.append((t,dictionary.get(t),alltok.count(dictionary.get(t)),alltok.count(dictionary.get(t))/len(alltok),freqsent,freqsent/lencorpus))
                        freq.sort(key=lambda tup: tup[5], reverse=True)
                        for i in range(len(freq)):
                            freq[i]=tuple(list(freq[i])+[i])
                        if not os.path.exists('resources/bow/allfreq/stanford/'):
                            os.makedirs('resources/bow/allfreq/stanford/')
                        with open('resources/bow/allfreq/stanford/'+keyword+'_'+emotion.lower()+'.txt', 'w') as f:
                            for item in freq:
                                f.write(str(item)+'\n')
                            f.close()

                        print("writing bow file")
                        top_tokens=[f[1] for f in freq[:500]]
                        lentoptok=len(top_tokens)
                        corpus_bow={}
                        toplen=0
                        for i in range(len(corpus_tok)):
                            corpus_bow[i]=[0]*lentoptok
                            if len(corpus_tok[i][0]+corpus_tok[i][1])>toplen:
                                toplen=len(corpus_tok[i][0]+corpus_tok[i][1])
                            for tok in corpus_tok[i][1]:
                                if tok in top_tokens:
                                    corpus_bow[i][top_tokens.index(tok)]=1

                        with open('resources/bow/'+keyword+'_'+emotion.lower()+'.csv', mode='w') as file:
                            writer = csv.writer(file, delimiter='|', quotechar='"',
                                                         quoting=csv.QUOTE_MINIMAL)
                            writer.writerow(['']*toplen+top_tokens)
                            for i in corpus_bow.keys():
                                writer.writerow(corpus_tok[i][0]+corpus_tok[i][1]+['']*(toplen-len(corpus_tok[i][0]+corpus_tok[i][1]))+corpus_bow[i])
                        file.close()
                    print('------------------------------------------------------')
                    print(str(time.time() - start_time) + ' seconds to compute ' + keyword + ' ' + emotion)
    f.close()
def build_country_indices():
    db = db_connection()
    db.connect()
    queryexecutor = db_operator(db)
    query = 'select distinct(Country) from masterthesis.reviews;'
    print("retrieving countries of tourists")
    tourcountries = [x[0] for x in queryexecutor.execute(query=query)][1:]

    query = 'select distinct(CountryID) from masterthesis.hotels;'
    print("retrieving countries of hotels")
    hotcountries = [x[0] for x in queryexecutor.execute(query=query)]
    db.disconnect()
    special_countries = list()
    country_to_code = dict()
    #https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes
    country_to_code['Antigua &amp; Barbuda'] = 'ag'
    country_to_code['Bonaire St Eustatius and Saba'] = 'bq'
    country_to_code['Cape Verde'] = 'cv'
    country_to_code['Central Africa Republic'] = 'cf'
    country_to_code['Cocos (K) I.'] = 'cc'
    country_to_code['Curaçao'] = 'cw'
    country_to_code['Democratic Republic of the Congo'] = 'cd'
    country_to_code['East Timor'] = 'tl'
    country_to_code['Equitorial Guinea'] = 'gq'
    country_to_code['France, Metro.'] = 'fr'
    country_to_code['Heard and McDonald Islands'] = 'hm'
    country_to_code['Laos'] = 'la'
    country_to_code['Netherlands Antilles'] = 'nt'
    country_to_code['North Korea'] = 'kp'
    country_to_code['Palestinian Territory'] = 'ps'
    country_to_code['Saint Vincent &amp; Grenadines'] = 'vc'
    country_to_code['São Tomé and Príncipe'] = 'st'
    country_to_code['Serbia and Montenegro'] = 'em'
    country_to_code['South Korea'] = 'kr'
    country_to_code['St. Helena'] = 'sh'
    country_to_code['St. Pierre and Miquelon'] = 'pm'
    country_to_code['Svalbard &amp; Jan Mayen'] = 'sj'
    country_to_code['Swaziland'] = 'sz'
    country_to_code['Turks &amp; Caicos Islands'] = 'tc'
    country_to_code['U.K. Virgin Islands'] = 'vg'
    country_to_code['U.S. Virgin Islands'] = 'vi'
    country_to_code['U.S.A.'] = 'us'

    for key in country_to_code.keys():
        special_countries.append(key)
    code_to_country = dict()
    for k, v in country_to_code.items():
        code_to_country[v] = k
    for cont in tourcountries:
        try:
            code = pycountry.countries.search_fuzzy(cont)[0].alpha_2.lower()
        except Exception:
            continue  # skip countries that pycountry cannot resolve
        if code not in code_to_country:
            code_to_country[code] = cont
        if cont not in country_to_code:
            country_to_code[cont] = code
    for cont in hotcountries:
        cname = (pycountry.countries.get(alpha_2=cont.upper())).name
        if cont not in code_to_country.keys():
            code_to_country[cont] = cname
            country_to_code[cname] = cont
    print("writing the indices")
    with open('resources/tourist_country_index.csv', mode='w') as file:
        writer = csv.writer(file,
                            delimiter='|',
                            quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        for i in range(1, len(list(country_to_code.keys())) + 1):
            writer.writerow([i, list(country_to_code.keys())[i - 1]])
        i += 1
        writer.writerow([i, 'no_country'])
        i += 1
        writer.writerow([i, ''])
    file.close()
    with open('resources/hotel_country_index.csv', mode='w') as file:
        writer = csv.writer(file,
                            delimiter='|',
                            quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        for i in range(1, len(list(code_to_country.keys())) + 1):
            writer.writerow([i, list(code_to_country.keys())[i - 1]])
        writer.writerow([i + 1, 'no_country'])
    file.close()
    with open('resources/country_to_code.csv', mode='w') as file:
        writer = csv.writer(file,
                            delimiter='|',
                            quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        for key in [
                x[0] for x in sorted(country_to_code.items(),
                                     key=operator.itemgetter(1),
                                     reverse=False)
        ]:
            writer.writerow([key, country_to_code[key]])
    file.close()
    with open('resources/code_to_country.csv', mode='w') as file:
        writer = csv.writer(file,
                            delimiter='|',
                            quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        for key in [
                x[0] for x in sorted(code_to_country.items(),
                                     key=operator.itemgetter(0),
                                     reverse=False)
        ]:
            writer.writerow([key, code_to_country[key]])
    file.close()
    print("writing countries indices over")
Example #15
def get_spell_by_time(request, castTime):
    conn = db_connection(request)
    cursor = conn.cursor()
    spell = _get_spell_by_time(cursor, castTime)
    conn.close()
    return spell
Example #16
def get_current_spells(request):
    conn = db_connection(request)
    cursor = conn.cursor()
    currentSpells = _get_current_spells(cursor)
    conn.close()
    return currentSpells
Example #17
import mysql.connector
import db
mydb = db.db_connection("localhost", "root", "root", "python_practise")
# mydb = mysql.connector.connect(
#     host="localhost",
#     user="******",
#     passwd="root",
#     database="python_practise"
# )
mycursor = mydb.cursor()

mycursor.execute("SELECT * from customer")

myresult = mycursor.fetchall()
mySingleResult = mycursor.fetchone()
for x in myresult:
    print(x)

print(mySingleResult)
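
# Follow-on sketch (not part of the original example): the same connection used
# with a parameterized query instead of string concatenation. The `customer`
# table comes from the example above; the `id` column is an assumption.
mycursor.execute("SELECT * FROM customer WHERE id = %s", (1,))
print(mycursor.fetchone())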
def do(originfile):
    start_time = time.time()
    db = db_connection()
    queryexecutor = db_operator(db)
    keywords = {}
    f = open(originfile, "r")
    for line in f:
        keyword = line[:-1]  # strip the trailing newline (the last line must also end with \n)
        keywords[keyword] = []
        fs = open(
            "subkeywords_booking/subkeywords_booking_cleaned/" + keyword +
            ".txt", "r")
        for linesub in fs:
            keywords[keyword].append(
                linesub[:-1])  # strip the trailing newline (the last line must also end with \n)
        fs.close()
    f.close()
    #nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)
    #nlp.add_pipe(LanguageDetector(language_detection_function=custom_detection_function), name="language_detector",last=True)
    print("Number of processors: ", mp.cpu_count())
    for emotion in ['Good', 'Bad']:
        print("begin " + emotion)
        for keyword in keywords.keys():
            print(keyword)
            '''f=open('csvs/' + keyword + '_' + emotion.lower() + '.csv', mode='w')
            f.close()
            liketext = 'SELECT ReviewID, Country, ' + emotion + ' from masterthesis.reviews where '
            '''
            subkeywords = keywords[keyword]
            '''for subkey in subkeywords:
                liketext += emotion + " LIKE '%" + subkey + "%' or "
            liketext = liketext[:-4]
            #liketext+=" limit 10000;"
            liketext += ";"
            db.connect()
            fields = queryexecutor.execute(query=liketext)
            db.disconnect()
            print("start analyzing sentences")'''
            toaddsents = []
            csv_file = open('csvs/' + keyword + '_' + emotion.lower() + '.csv',
                            mode='w',
                            encoding="utf8",
                            newline='\n')
            csv_file.close()
            csv_file = open('csvs/all_sentences/' + keyword + '_' +
                            emotion.lower() + '.csv',
                            mode='r',
                            encoding="utf8",
                            newline='\n')
            '''row_count = sum(1 for row in reader)
            print("number of sentences: " + str(len(row_count)))'''
            #print("number of reviews: "+str(len(fields)))
            reader = csv.reader(csv_file, delimiter='|', quotechar='"')
            i = 0
            allsents = []
            for row in reader:
                i += 1
                if i % 100 == 0:
                    print(str(i))
                allsents.append(row)
                if i % 1000 == 0: break
                #sent=row[2]
                '''lan = identifier.classify(sent)[0]
                acc = identifier.classify(sent)[1]
                if lan == 'en' and acc >= 0.9:
                    sentan = TextBlob(sent)
                    sent = (sentan.correct()).string
                    sentan = TextBlob(sent)
                    words = word_tokenize(sent)
                    skipsentence = True
                    for word in words:
                        if word.lower() in subkeywords:
                            skipsentence = False
                            break
                    if not skipsentence:
                        # tool = grammar_check.LanguageTool('en-GB')
                        # matches = tool.check(text)
                        sent = sent[0].upper() + sent[0 + 1:]
                        sent = sent.replace(' i ', ' I ')
                        #toaddsents.append([row[0],row[1],sent])
                        # sentan.sentiment_assessments
                        if sentan.polarity > 0.4 and sentan.subjectivity > 0.65:
                            toaddsents.append([row[0],row[1],sent])'''
            csv_file.close()
            print('num of reviews: ' + str(len(allsents)))
            counter = Value('i', 0)
            pool = mp.Pool(
                initializer=init_globals,
                processes=mp.cpu_count() * 2,
                initargs=(counter, ),
            )
            results = pool.map_async(thread_function_row_only,
                                     [row for row in allsents]).get()
            pool.close()
            pool.join()
            results = [r for r in results if r != None]
            '''l = chunkIt(allsents,2)
            ths = []
            threads = list()
            for index in range(2):
                li=l[index]
                x = threading.Thread(target=thread_function, args=(index,li,))
                threads.append(x)
                x.start()
            for index,thread in enumerate(threads):
                thread.join()'''
            '''for rew in fields:
                i+=1
                if i%10000==0:
                    print(str(i))
                text = rew[2]
                text = text.replace('\n', ' ')
                text = text.replace('\t', ' ')
                text = text.replace('\r', ' ')
                text = text.replace('.', '. ')
                text = text.replace(':', ': ')
                text = text.replace(';', '; ')
                text = text.replace(',', ', ')
                lan=identifier.classify(text)[0]
                acc=identifier.classify(text)[1]
                if lan == 'en' and acc >= 0.9:
                    tokenized_text = sent_tokenize(text)
                    for sent in tokenized_text:
                        sentan = TextBlob(sent)
                        sent = (sentan.correct()).string
                        sentan=TextBlob(sent)
                        words = word_tokenize(sent)
                        skipsentence = True
                        for word in words:
                            if word.lower() in subkeywords:
                                skipsentence = False
                                break
                        if not skipsentence:
                            # tool = grammar_check.LanguageTool('en-GB')
                            # matches = tool.check(text)
                            sent = sent[0].upper() + sent[0 + 1:]
                            sent = sent.replace(' i ', ' I ')
                            # sentan.ngrams(3)
                            # sentan.sentiment_assessments
                            if sentan.polarity > 0.4 and sentan.subjectivity > 0.65:
                                toaddsents.append([rew[0], rew[1], sent])
                tokenized_text = sent_tokenize(text)
                for sent in tokenized_text:
                    toaddsents.append([rew[0], rew[1], sent])'''
            print("start writing sentences")
            print("num sents: " + str(len(results)))
            i = 0
            csv_file = open('csvs/' + keyword + '_' + emotion.lower() + '.csv',
                            mode='a',
                            encoding="utf8",
                            newline='\n')
            for sen in results:
                i += 1
                if i % 100000 == 0:
                    print(str(i))
                writer = csv.writer(csv_file,
                                    delimiter='|',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
                writer.writerow(sen)
            csv_file.close()
    print("done")
    print("--- %s seconds ---" % (time.time() - start_time))
Example #19
    def db_movie_save(self):

        db = db_connection()

        save_movie_doc(self.title, self.theater, self.room, self.time_offset, db)