def get_spells(request):
    conn = db_connection(request)
    cursor = conn.cursor()
    cursor.execute("SELECT * FROM spells;")
    result = cursor.fetchall()
    conn.close()
    return result
def do(originfile):
    # For every keyword listed in originfile, build two views over masterthesis.reviews:
    # one matching the keyword's subkeywords in the good column, one in the bad column.
    f = open(originfile, "r")
    for line in f:
        liketextgood = ''
        liketextbad = ''
        kw = line[:-1]  # important to have last line with \n
        fs = open("subkeywords_booking/subkeywords_booking_cleaned/" + kw + ".txt", "r")
        l = []
        for subkw in fs:
            l.append(subkw)
            if '\'' in subkw:
                subkw = subkw.replace('\'', "''")  # escape single quotes for SQL
            subkw = subkw[:-1]
            if subkw != '':
                liketextgood += ' or good LIKE \'%' + subkw + '%\''
                liketextbad += ' or bad LIKE \'%' + subkw + '%\''
        fs.close()
        liketextgood = liketextgood[4:]  # drop the leading ' or '
        liketextgood += ';'
        liketextbad = liketextbad[4:]
        liketextbad += ';'
        db = db_connection()
        queryexecutor = db_operator(db)
        db.connect()
        query_good = 'CREATE VIEW ' + kw + '_good_view AS SELECT * FROM masterthesis.reviews where ' + liketextgood
        query_bad = 'CREATE VIEW ' + kw + '_bad_view AS SELECT * FROM masterthesis.reviews where ' + liketextbad
        print(query_bad)
        print(query_good)
        viewgood = queryexecutor.execute(query=query_good)
        viewbad = queryexecutor.execute(query=query_bad)
        db.disconnect()
    f.close()
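# Hedged sketch (not part of the original module): the clause-building step in do() can be
# exercised on its own, without a database, to see the SQL it produces. The helper name
# build_like_clause and the sample subkeywords are illustrative assumptions.
def build_like_clause(column, subkeywords):
    parts = []
    for subkw in subkeywords:
        subkw = subkw.replace("'", "''")  # escape single quotes as in do()
        if subkw:
            parts.append(column + " LIKE '%" + subkw + "%'")
    return ' or '.join(parts) + ';'

# Example: build_like_clause('good', ['towel', 'linen'])
# -> "good LIKE '%towel%' or good LIKE '%linen%';"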
def fetchall_cursor(fetchall_query):
    cursor = db_connection()
    cursor.execute(fetchall_query)
    row = cursor.fetchall()
    resp = jsonify(row)
    resp.status_code = 200
    return resp
def mark_spell_complete(request, castTime):
    conn = db_connection(request)
    cursor = conn.cursor()
    result = _mark_spell_complete(cursor, castTime)
    conn.commit()
    conn.close()
    return result
def option_two():
    documents = db_connection()
    menu.movies_registred()
    for dic in documents.find({}):
        movie = MovieFromDb(dic["name"], dic["theater"], dic["room"], dic["time"])
        print(movie)
    exit()
def option_three():
    documents = db_connection()
    title = validation.name_input(title="")
    find_document = documents.find_one({"name": f"{title}"}, projection={"_id": False})
    movie = MovieFromDb(find_document["name"], find_document["theater"],
                        find_document["room"], find_document["time"])
    menu.movies_registred()
    print(movie)
    exit()
def getRawCorpus(csv_file, id_and_country=False, additionaldetails=False, limit=-1, offset=0):
    raw_corpus = []
    reader = csv.reader(csv_file, delimiter='|', quotechar='"')
    i = 0
    if additionaldetails:
        db = db_connection()
        queryexecutor = db_operator(db)
        db.connect()
        for row in reader:
            if i % 50000 == 0 and i != 0:
                print('reading sentence ' + str(i))
            i += 1
            if i > offset and i <= offset + limit or limit == -1:
                id = row[0]
                query = 'SELECT HotelNumber, FamilyType FROM masterthesis.reviews WHERE ReviewID=' + id
                det = queryexecutor.execute(query=query)
                if len(det) <= 0:
                    # fallback must be a single tuple, so det[0][0]/det[0][1] below stay correct
                    det = [('no_hotel_number', 'no_family_type')]
                det = [det[0][0], det[0][1]]
                query = 'SELECT CountryID FROM masterthesis.hotels WHERE HotelNumber=' + str(det[0])
                hot = queryexecutor.execute(query=query)
                if len(hot) <= 0:
                    hot = [('no_country', )]
                hot = [hot[0][0]]
                det = det + hot
                raw_corpus.append(row + det)
            if i > offset + limit and limit > 0:
                break
        db.disconnect()
        return raw_corpus
    if id_and_country:
        for row in reader:
            if i % 50000 == 0 and i != 0:
                print('reading sentence ' + str(i))
            i += 1
            raw_corpus.append(row)
    else:
        for row in reader:
            if i % 50000 == 0 and i != 0:
                print('reading sentence ' + str(i))
            i += 1
            raw_corpus.append(row[2])
    csv_file.close()
    return raw_corpus
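# Hedged usage sketch for getRawCorpus() above: the CSV path and keyword/emotion pair are
# assumptions chosen to mirror how the other functions in this module name these files
# ('resources/csvs/<keyword>_<emotion>.csv'); additionaldetails=True also needs the
# db_connection/db_operator helpers to be importable.
with open('resources/csvs/cleaning_good.csv', mode='r', encoding='utf8', newline='\n') as csv_file:
    first_batch = getRawCorpus(csv_file, additionaldetails=True, limit=1000, offset=0)
print(len(first_batch))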
def main(scrapestreetlinks=None, scrapepdflinks=None, download=None, parse=None, save=None):
    links = []
    if scrapestreetlinks:
        links = get_bin_links()
    if scrapepdflinks:
        if not len(links):
            links = pd.read_csv(LINKSCSV)
        links['pdf_url'] = links['uri'].apply(get_collection_pdf_link)
        links.to_csv("binlinks.csv")
    if download:
        if not len(links):
            links = pd.read_csv(LINKSCSV)
        get_collection_pdfs(links)
    if parse:
        if not len(links):
            links = pd.read_csv(LINKSCSV)
        links['filename'] = links['pdf_url'].str.split("/").str[-1]
        times = parse_pdfs()
        bintimes = pd.merge(links, times, left_on='filename', right_on='filename',
                            how='right').drop(columns=['Unnamed: 0'])
        bintimes.to_csv('bintimes.csv')
    if save:
        bintimes = pd.read_csv('bintimes.csv')
        bindicts = bintimes.to_dict(orient='records')
        for idx, bindict in enumerate(bindicts):
            datestr = bindict['date']
            if not isinstance(datestr, str):
                datestr = datetime.datetime.strftime(bindict['date'], '%Y-%m-%d')
            bindict['_id'] = bindict['street'] + '_' + datestr
            bindict['_id'] = bindict['_id'].replace(' ', '_').lower()
            bindict['city'] = 'Edinburgh'.lower()
            bindicts[idx] = bindict
        print('upserting {} records'.format(len(bindicts)))
        bulk_upsert(db_connection(), 'bindays', 'days', bindicts)
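# Hedged usage sketch: running the bin-collection pipeline above end to end. The flag values
# are illustrative; any subset of steps can be re-run on its own because each stage falls back
# to the CSVs written by the previous one (LINKSCSV, binlinks.csv, bintimes.csv).
if __name__ == '__main__':
    main(scrapestreetlinks=True,
         scrapepdflinks=True,
         download=True,
         parse=True,
         save=True)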
def create_spell(request, params):
    conn = db_connection(request)
    cursor = conn.cursor()
    spellTime = int(time.time())
    # We use spellTime as a primary key. So if we should happen to get two spells
    # at the same second, pretend like the second came a second later.
    while _get_spell_by_time(cursor, spellTime):
        spellTime += 1
    try:
        assert(_authenticate_user(params['user_name'], params['spirit_animal'], cursor))
        assert(isinstance(params['name'], basestring))
        assert(params['name'] != '')
        assert(params['setup'] or params['loop'])
        for component in ['setup', 'loop']:
            if params[component]:
                for frame in params[component]:
                    try:
                        assert(validate_frame(frame))
                    except:
                        log.debug(frame)
                        raise AssertionError()
    except AssertionError:  # was `except IOError()`, which would never catch the failed asserts
        return False
    setup = json.dumps(params['setup']) if params['setup'] else ''
    loop = json.dumps(params['loop']) if params['loop'] else ''
    cursor.execute('INSERT INTO spells VALUES (?,?,?,?,?,?,?)', (
        params['user_name'],
        params['name'],
        3,
        spellTime,
        setup,
        loop,
        0
    ))
    conn.commit()
    newSpell = _get_spell_by_time(cursor, spellTime)
    conn.close()
    return newSpell
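# Hedged sketch of the params dict create_spell() expects. The field values (user_name,
# spirit_animal, and the frame payloads) are illustrative assumptions; validate_frame()
# ultimately decides what a legal frame looks like.
example_params = {
    'user_name': 'alice',
    'spirit_animal': 'owl',
    'name': 'blink',
    'setup': [[0, 0, 0]],  # frames: whatever validate_frame() accepts
    'loop': [[1, 1, 1]],
}
# new_spell = create_spell(request, example_params)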
from datetime import date, datetime

from flask import Flask, request
from flask_cors import CORS
from flask_restful import Api, Resource, reqparse
from flask_restful.representations.json import output_json
# db_connection, search_streets and get_days are assumed to come from this project's own
# db helper module (not shown here).

app = Flask(__name__)
app.config['CORS_HEADERS'] = 'Content-Type'
cors = CORS(app, resources={r"*": {"origins": "*"}})
api = Api(app)

parser = reqparse.RequestParser()
parser.add_argument('term')

HEADERS = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Headers': '*'
}

conn = db_connection()


class SearchStreets(Resource):
    def get(self):
        search_term = request.args.get('q')
        streets = search_streets(conn, str(search_term))
        return output_json(streets, 200)


class Street(Resource):
    def get(self, city, street):
        bindays = get_days(conn, city, street)
        return output_json(bindays, 200)
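# Hedged sketch of wiring the two resources above into the Api and serving them; the URL
# patterns are assumptions based on the handler signatures (SearchStreets reads ?q=...,
# Street takes city and street path segments).
api.add_resource(SearchStreets, '/streets/search')
api.add_resource(Street, '/streets/<string:city>/<string:street>')

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)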
def cursor_request(*args):
    cursor = db_connection()
    cursor.execute(*args)
    mysql.connection.commit()
    cursor.close()
from flask import Flask, jsonify, render_template
from db import db_connection

app = Flask(__name__)
db = db_connection()
connection = db.connect_to_db(app)


@app.route('/songs')
def get_songs():
    songs = db.get_songs(connection)
    print(songs)
    return render_template("songs.html", songs=songs)


@app.route('/')
def home():
    return render_template("home.html")


if __name__ == "__main__":
    print('Flask app is running...')
    app.run('0.0.0.0', 8080)  # pass the port as an int
    print('...Flask app has stopped.')
def analyze(originfile, all=False):
    keywords = helper.getKeywords(originfile)
    # (Re)start the Stanford CoreNLP server on port 9000.
    os.chdir('./resources/stanford-corenlp-full-2018-10-05')
    os.system('kill $(lsof -t -i:9000)')
    cmd = 'java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "tokenize,ssplit,pos,lemma,parse,sentiment" -port 9000 -timeout 10000000000000 &'
    time.sleep(4)
    print("starting nlp service")
    with open(os.devnull, "w") as f:
        subprocess.call(cmd, shell=True, stderr=f, stdout=f)
    time.sleep(4)
    print("nlp service started")
    os.chdir('../../')
    nlp_wrapper = StanfordCoreNLP('http://localhost:9000')
    print("Number of processors: ", mp.cpu_count())
    if all:
        # 'all' mode: rebuild the bag-of-words file over the full corpus from the
        # previously written corpus_tok_all.csv and all.txt resources.
        print("all")
        '''if not os.path.isfile('/resources/all_test.csv'):
            print("test file created")
            open('./resources/all_test.csv', 'w').close()'''
        conn = db.db_connection()
        dbo = db.db_operator(conn)
        spell = SpellChecker()
        counter = Value('i', 1)
        corpus_tok_all = []
        '''for i in range(1790):
            print('i=' + str(i))
            print("limit= 10000")
            print("offset= " + str(10000 * i))
            conn.connect()
            query = 'SELECT reviews.ReviewID, reviews.Country as \'Tourist_Country\', ' \
                    'hotels.CountryID as \'Hotel Country\', Good, reviews.Bad ' \
                    'FROM masterthesis.reviews, masterthesis.hotels ' \
                    'where hotels.HotelNumber=reviews.HotelNumber limit 10000 offset ' + str(10000 * i) + ';'
            results = [list(x) for x in dbo.execute(query)]
            conn.disconnect()
            print("got results from sql")
            print("starting analysis")
            print("tot number rows= " + str(len(results)))
            try:
                print('analyzing 10000 rows ' + time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2,
                               initargs=(counter, spell, nlp_wrapper,), )
                corpus_tok = pool.map_async(thread_function_row_only_all, [doc for doc in results]).get(timeout=1200)
                pool.close()
                pool.terminate()
                pool.join()
                print('got corpus_tok for 10000 rows ' + time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            except TimeoutError:
                print("timeout error")
                pool.close()
                pool.terminate()
                pool.join()
                corpus_tok = []
                for doc in results:
                    try:
                        pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2,
                                       initargs=(counter, spell, nlp_wrapper,), )
                        c = pool.map_async(thread_function_row_only_all, [doc]).get(timeout=60)
                        # print('pool close')
                        pool.close()
                        pool.terminate()
                        # print('pool join')
                        pool.join()
                    except TimeoutError:
                        print(str(doc) + " caused Exception")
                        pool.close()
                        pool.terminate()
                        # print('pool join')
                        pool.join()
                        c = [None]
                    corpus_tok.append(c[0])
            print("beginning removal of sents with contrast")
            corpus_tok = [r for r in corpus_tok if r != None]
            print('len corpus_tok_reduced= ' + str(len(corpus_tok)))
            corpus_tok_all += corpus_tok
            print('len corpus_tok_all= ' + str(len(corpus_tok_all)))
            if i % 100 == 0 and i != 0:
                with open('./resources/all_test.csv', mode='a') as file:
                    writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                    for c in corpus_tok_all:
                        writer.writerow(c)
                file.close()
                corpus_tok_all = []
        '''
        '''
        corpus_tok_all = []
        i = 0
        kk = set()
        with open('./resources/all_test.csv', mode='r') as file:
            reader = csv.reader(file, delimiter='|', quotechar='"')
            for row in reader:
                i += 1
                if i % 100000 == 0:
                    print(i)
                # if i % 10000 == 0: break
                ar = ((row[0].replace('[', '')).replace(']', '')).split(',')
                if ar[1][-1] != "'":  # France, Metro.
                    ar[1] = ar[1] + ',' + ar[2]
                    for j in range(2, len(ar) - 1):
                        ar[j] = ar[j + 1]
                    del ar[len(ar) - 1]
                ar[1] = ar[1][2:-1]
                ar[2] = (ar[2].replace("'", '')).replace(' ', '')
                rev = ''.join(ar[3:])
                revlist = ar[:3]
                revlist.append(rev)
                tokens = ((((row[1].replace(']', '')).replace('[', '')).replace("'", '')).replace(" ", '')).split(',')
                r = (revlist, tokens)
                k = ar[0]
                if k not in kk:
                    kk.add(k)
                    corpus_tok_all.append(r)
        file.close()
        corpus_tok = corpus_tok_all
        corpustokonly = [r[1] for r in corpus_tok]
        print("doing bigrams")
        # Add bigrams and trigrams to docs (only ones that appear 10 times or more).
        bigram = Phrases(corpustokonly, min_count=0.001 * len(corpus_tok))
        lenc = len(corpus_tok)
        print("corpus_tok len = " + str(lenc))
        for idx in range(lenc):
            if idx % 100000 == 0:
                print(idx)
                print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            for token in bigram[corpustokonly[idx]]:
                if '_' in token:
                    # Token is a bigram, add to document.
                    corpus_tok[idx][1].append(token)
        with open('./resources/corpus_tok_all.csv', mode='w') as file:
            writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerows(corpus_tok)
        file.close()
        print("corpus_tok written")
        from gensim.corpora import Dictionary
        print("writing frequence file")
        '''
        '''all_set = set()
        for emotion in ['Good', 'Bad']:
            print("begin " + emotion)
            for keyword in list(keywords.keys()):
                if not (keyword == 'cleaning' or keyword == 'pet'):
                    start_time = time.time()
                    print(keyword + ' ---- ' + time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                    raw_corpus = helper.getRawCorpus(
                        csv_file=open('resources/csvs/' + keyword + '_' + emotion.lower() + '.csv', mode='r',
                                      encoding="utf8", newline='\n'),
                        additionaldetails=True)
                    # corpus = helper.getCorpusTextFromRaw(raw_corpus)
                    spell = SpellChecker()
                    counter = Value('i', 1)
                    print("starting analysis")
                    pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2,
                                   initargs=(counter, spell, nlp_wrapper,), )
                    corpus_tok = pool.map_async(thread_function_row_only, [doc for doc in raw_corpus]).get()
                    print('pool close')
                    pool.close()
                    print('pool join')
                    pool.join()
                    print("beginning removal of sents with contrast")
                    corpus_tok = [r for r in corpus_tok if r != None]
                    ###############################################################################
                    # We find bigrams in the documents. Bigrams are sets of two adjacent words.
                    # Using bigrams we can get phrases like "machine_learning" in our output
                    # (spaces are replaced with underscores); without bigrams we would only get
                    # "machine" and "learning".
                    #
                    # Note that in the code below, we find bigrams and then add them to the
                    # original data, because we would like to keep the words "machine" and
                    # "learning" as well as the bigram "machine_learning".
                    #
                    # .. Important::
                    #     Computing n-grams of large dataset can be very computationally
                    #     and memory intensive.
                    #
                    print('len all_set_tok before= ' + str(len(all_set)))
                    print('len corpus_tok= ' + str(len(corpus_tok)))
                    print('len corpus_tok+all_set_tok= ' + str(len(corpus_tok) + len(all_set)))
                    for sen in corpus_tok:
                        all_set.add((tuple(sen[0]), tuple(sen[1])))
                    print('len all_set_tok after= ' + str(len(all_set)))
                    print('------------------------------------------------------')
                    print(str(time.time() - start_time) + ' seconds to compute ' + keyword + ' ' + emotion)
        # Compute bigrams.
        if len(all_set) > 0:
            corpus_tok = [(list(x[0]), list(x[1])) for x in all_set]
            corpustokonly = [r[1] for r in corpus_tok]
            print("doing bigrams")
            # Add bigrams and trigrams to docs (only ones that appear 10 times or more).
            bigram = Phrases(corpustokonly, min_count=0.001 * len(corpus_tok))
            for idx in range(len(corpus_tok)):
                for token in bigram[corpustokonly[idx]]:
                    if '_' in token:
                        # Token is a bigram, add to document.
                        corpus_tok[idx][1].append(token)
            from gensim.corpora import Dictionary
            print("writing frequence file")
            # Create a dictionary representation of the documents.
            dictionary = Dictionary(corpustokonly)
            alltok = []
            freq = []
            for doc in corpustokonly:
                for tok in doc:
                    alltok.append(tok)
            lencorpus = len(corpus_tok)
            print("len dictionary = " + str(len(dictionary.keys())))
            i = 0
            for t in dictionary:
                i += 1
                if i % 1000 == 0:
                    print("analyzing token " + str(i))
                freqsent = 0
                for doc in corpustokonly:
                    if dictionary.get(t) in doc:
                        freqsent += 1
                freq.append((t, dictionary.get(t), alltok.count(dictionary.get(t)),
                             alltok.count(dictionary.get(t)) / len(alltok), freqsent, freqsent / lencorpus))
            freq.sort(key=lambda tup: tup[5], reverse=True)
            for i in range(len(freq)):
                freq[i] = tuple(list(freq[i]) + [i])
            if not os.path.exists('resources/bow/allfreq/stanford/'):
                os.makedirs('resources/bow/allfreq/stanford/')
            with open('resources/bow/allfreq/stanford/all.txt', 'w') as f:
                for item in freq:
                    f.write(str(item) + '\n')
            f.close()
            print("writing bow file")
            top_tokens = [f[1] for f in freq[:500]]
            lentoptok = len(top_tokens)
            corpus_bow = {}
            toplen = 0
            for i in range(len(corpus_tok)):
                corpus_bow[i] = [0] * lentoptok
                if len(corpus_tok[i][0] + corpus_tok[i][1]) > toplen:
                    toplen = len(corpus_tok[i][0] + corpus_tok[i][1])
                for tok in corpus_tok[i][1]:
                    if tok in top_tokens:
                        corpus_bow[i][top_tokens.index(tok)] = 1
            with open('resources/bow/all.csv', mode='w') as file:
                writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                writer.writerow([''] * toplen + top_tokens)
                for i in corpus_bow.keys():
                    writer.writerow(corpus_tok[i][0] + corpus_tok[i][1] + [''] * (
                        toplen - len(corpus_tok[i][0] + corpus_tok[i][1])) + corpus_bow[i])
            file.close()
        '''
        # Create a dictionary representation of the documents.
        '''dictionary = Dictionary(corpustokonly)
        alltok = []
        freq = []
        for doc in corpustokonly:
            for tok in doc:
                alltok.append(tok)
        lencorpus = len(corpus_tok)
        print("len dictionary = " + str(len(dictionary.keys())))
        time.sleep(100000)
        counter = Value('i', 0)
        pool = mp.Pool(initializer=init_globals_token_analyzer, processes=mp.cpu_count(),
                       initargs=(counter, corpustokonly, dictionary, lencorpus, alltok), )
        print("pool initialized")
        corpustokonly = None
        alltok = None
        del corpustokonly, alltok
        freq = pool.map_async(thread_function_row_only_token_analyzer, [t for t in dictionary]).get()
        pool.close()
        pool.terminate()
        pool.join()
        dictionary = None
        del dictionary
        global ctonly, dic, alltoks
        ctonly = None
        dic = None
        alltoks = None
        del ctonly, dic, alltoks
        print("frequence list len= " + str(len(freq)))
        print("frequence list created")
        freq.sort(key=lambda tup: tup[5], reverse=True)
        print("frequence list sorted")
        for i in range(len(freq)):
            if i % 10000 == 0:
                print(i)
                print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            freq[i] = tuple(list(freq[i]) + [i])
        print("frequence list modified")
        if not os.path.exists('resources/bow/allfreq/stanford/'):
            os.makedirs('resources/bow/allfreq/stanford/')
        i = 0
        '''
        '''with open('resources/bow/allfreq/stanford/all.txt', 'w') as f:
            for item in freq:
                i += 1
                if i % 10000 == 0:
                    print(i)
                    print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                f.write(str(item) + '\n')
        f.close()'''
        corpus_tok = []
        i = 0
        with open('./resources/corpus_tok_all.csv', mode='r') as file:
            reader = csv.reader(file, delimiter='|', quotechar='"')
            for row in reader:
                i += 1
                if i % 100000 == 0:
                    print(i)
                corpus_tok.append(row)
        file.close()
        print("len corpus_tok= " + str(len(corpus_tok)))
        freq = []
        i = 0
        with open('./resources/bow/allfreq/stanford/all.txt', mode='r') as file:
            reader = csv.reader(file, delimiter='|', quotechar='"')
            for row in reader:
                i += 1
                if i == 501:
                    break
                freq.append(row)
        file.close()
        for i in range(len(freq)):
            freq[i] = freq[i][0]
            freq[i] = freq[i].replace("'", '')
            freq[i] = freq[i].replace('"', '')
            freq[i] = freq[i].replace('(', '')
            freq[i] = freq[i].replace(')', '')
            freq[i] = freq[i].replace(' ', '')
            freq[i] = freq[i].split(',')
            freq[i] = tuple(freq[i])
        for i in range(len(corpus_tok)):
            if i % 100000 == 0:
                print(i)
            corpus_tok[i][0] = corpus_tok[i][0].replace('[', '')
            corpus_tok[i][0] = corpus_tok[i][0].replace(']', '')
            det = (corpus_tok[i][0].split(','))
            if 'São Tomé' in det[1]:  # São Tomé and PrÃ\xadncipe
                det[1] = ' ' + 'São Tomé and PrÃ\xadncipe' + ' '
            if det[1][-1] != "'":  # France, Metro
                if 'Ivoire' in det[1]:  # Cote d'Ivoire
                    det[1] = det[1].replace('\\', '')
                    det[2] = det[2][1:]
                else:
                    det[1] = det[1] + ',' + det[2]
                    for j in range(2, len(det) - 1):
                        det[j] = det[j + 1]
                    del det[len(det) - 1]
            det = det[:3]
            desc = (corpus_tok[i][0].split(','))[-1]
            det[0] = det[0][1:-1]
            det[1] = det[1][2:-1]
            det[2] = det[2][2:-1]
            desc = desc[3:-1]
            det.append(desc)
            corpus_tok[i][0] = det
            corpus_tok[i][1] = corpus_tok[i][1].replace("'", '')
            corpus_tok[i][1] = corpus_tok[i][1].replace(' ', '')
            corpus_tok[i][1] = corpus_tok[i][1].replace('[', '')
            corpus_tok[i][1] = corpus_tok[i][1].replace(']', '')
            corpus_tok[i][1] = corpus_tok[i][1].split(',')
        print("writing bow file")
        top_tokens = [f[1] for f in freq[:400]]
        lentoptok = len(top_tokens)
        corpus_bow = {}
        toplen = 0
        print("corpus_tok_len= " + str(len(corpus_tok)))
        for i in range(len(corpus_tok)):
            corpus_bow[i] = [0] * lentoptok
            if i % 100000 == 0:
                print(i)
                print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            if len(corpus_tok[i][0] + corpus_tok[i][1]) > toplen:
                toplen = len(corpus_tok[i][0] + corpus_tok[i][1])
            for tok in corpus_tok[i][1]:
                if tok in top_tokens:
                    corpus_bow[i][top_tokens.index(tok)] = 1
        print("len corpus_bow keys= " + str(len(corpus_bow.keys())))
        print("got corpus_bow")
        j = 0
        print("corpus_bow_len " + str(len(corpus_bow)))
        with open('resources/bow/all.csv', mode='w') as file:
            writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerow([''] * toplen + top_tokens)
            for i in corpus_bow.keys():
                j += 1
                if j % 100000 == 0:
                    print(j)
                    print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                writer.writerow(corpus_tok[i][0] + corpus_tok[i][1] + [''] * (
                    toplen - len(corpus_tok[i][0] + corpus_tok[i][1])) + corpus_bow[i])
        file.close()
        print("over")
    else:
        # Otherwise process a single keyword/emotion slice (currently cleaning/Good).
        print("not all")
        for emotion in ['Good', 'Bad']:
            print("begin " + emotion)
            for keyword in list(keywords.keys()):
                if emotion == 'Good' and keyword == 'cleaning':  # cleaning good
                    start_time = time.time()
                    print(keyword + ' ---- ' + time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                    spell = SpellChecker()
                    counter = Value('i', 1)
                    corpus_tok_all = []
                    # if not os.path.isfile('/resources/cleaning_test.csv'):
                    #     open('./resources/cleaning_test.csv', 'w').close()
                    for i in range(400):  # 400
                        print(str(i))
                        offset = i * 1000
                        limit = 1000
                        print("starting reading")
                        print("limit=" + str(limit))
                        print("offset=" + str(offset))
                        raw_corpus = helper.getRawCorpus(
                            csv_file=open('resources/csvs/' + keyword + '_' + emotion.lower() + '.csv', mode='r',
                                          encoding="utf8", newline='\n'),
                            additionaldetails=True, limit=limit, offset=offset)
                        # corpus = helper.getCorpusTextFromRaw(raw_corpus)
                        # raw_corpus_half_one = raw_corpus[:int(len(raw_corpus) / 2)]
                        # raw_corpus_half_two = raw_corpus[int(len(raw_corpus) / 2):]
                        print("starting analysis")
                        pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2,
                                       initargs=(counter, spell, nlp_wrapper,), )
                        try:
                            corpus_tok = pool.map_async(thread_function_row_only,
                                                        [doc for doc in raw_corpus]).get(timeout=30)
                            pool.close()
                            pool.join()
                        except TimeoutError:
                            print("timeout error")
                            print('pool close')
                            pool.close()
                            print('pool terminate')
                            pool.terminate()
                            print('pool join')
                            pool.join()
                            corpus_tok = []
                            for doc in raw_corpus:
                                try:
                                    pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2,
                                                   initargs=(counter, spell, nlp_wrapper,), )
                                    c = pool.map_async(thread_function_row_only, [doc]).get(timeout=30)
                                    # print('pool close')
                                    pool.close()
                                    # print('pool join')
                                    pool.join()
                                    '''thread = threading.Thread(target=thread_function_row_only, args=(doc))
                                    thread.start()
                                    thread.join()
                                    c = que.get()'''
                                except TimeoutError:
                                    print(str(doc) + " caused Exception")
                                    c = [None]
                                corpus_tok.append(c[0])
                        corpus_tok_reduced = [r for r in corpus_tok if r != None]
                        print("len corpus_tok: " + str(len(corpus_tok)))
                        print("len corpus_tok_reduced: " + str(len(corpus_tok_reduced)))
                        '''with open('./resources/cleaning_test.csv', mode='a') as file:
                            writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                            for c in corpus_tok_reduced:
                                writer.writerow(c)
                        file.close()'''
                        corpus_tok_all += corpus_tok_reduced
                        print("len corpus_tok_all: " + str(len(corpus_tok_all)))
                    '''
                    corpus_tok = []
                    s = 0
                    for doc in corpus:
                        newdoc = False
                        doc = doc.lower()
                        s += 1
                        if s % 10000 == 0:
                            print(str(s))
                        for con in constr_conjs:
                            if con in doc:
                                newdoc = True
                                break
                        if not newdoc:
                            toks = [spell.correction(tok['lemma']) for tok in
                                    nlp_wrapper.annotate(doc, properties={'annotators': 'lemma, pos',
                                                                          'outputFormat': 'json', })['sentences'][0]['tokens']
                                    if tok['pos'] in ['NNS', 'NN'] and len(tok['lemma']) > 1]
                            toapp = []
                            for i in range(len(toks)):
                                if '/' in toks[i]:
                                    for tok in toks[i].split('/'):
                                        toapp.append(tok)
                            for tok in toapp:
                                toks.append(tok)
                            toapp = []
                            for i in range(len(toks)):
                                if '-' in toks[i]:
                                    for tok in toks[i].split('-'):
                                        toapp.append(tok)
                            for tok in toapp:
                                toks.append(tok)
                            corpus_tok.append(toks)'''
                    # print("beginning removal of sents with contrast")
                    corpus_tok = corpus_tok_all
                    print("len corpus_tok: " + str(len(corpus_tok)))
                    ###############################################################################
                    # We find bigrams in the documents. Bigrams are sets of two adjacent words.
                    # Using bigrams we can get phrases like "machine_learning" in our output
                    # (spaces are replaced with underscores); without bigrams we would only get
                    # "machine" and "learning".
                    #
                    # Note that in the code below, we find bigrams and then add them to the
                    # original data, because we would like to keep the words "machine" and
                    # "learning" as well as the bigram "machine_learning".
                    #
                    # .. Important::
                    #     Computing n-grams of large dataset can be very computationally
                    #     and memory intensive.
                    #
                    # Compute bigrams.
                    if len(corpus_tok) > 0:
                        corpustokonly = [r[1] for r in corpus_tok]
                        print("doing bigrams")
                        # Add bigrams and trigrams to docs (only ones that appear 10 times or more).
                        bigram = Phrases(corpustokonly, min_count=0.001 * len(corpus_tok))
                        for idx in range(len(corpus_tok)):
                            for token in bigram[corpustokonly[idx]]:
                                if '_' in token:
                                    # Token is a bigram, add to document.
                                    corpus_tok[idx][1].append(token)
                        from gensim.corpora import Dictionary
                        print("writing frequence file")
                        # Create a dictionary representation of the documents.
                        dictionary = Dictionary(corpustokonly)
                        alltok = []
                        freq = []
                        for doc in corpustokonly:
                            for tok in doc:
                                alltok.append(tok)
                        lencorpus = len(corpus_tok)
                        print("len dictionary = " + str(len(dictionary.keys())))
                        i = 0
                        for t in dictionary:
                            i += 1
                            if i % 1000 == 0:
                                print("analyzing token " + str(i))
                            freqsent = 0
                            for doc in corpustokonly:
                                if dictionary.get(t) in doc:
                                    freqsent += 1
                            freq.append((t, dictionary.get(t), alltok.count(dictionary.get(t)),
                                         alltok.count(dictionary.get(t)) / len(alltok), freqsent, freqsent / lencorpus))
                        freq.sort(key=lambda tup: tup[5], reverse=True)
                        for i in range(len(freq)):
                            freq[i] = tuple(list(freq[i]) + [i])
                        if not os.path.exists('resources/bow/allfreq/stanford/'):
                            os.makedirs('resources/bow/allfreq/stanford/')
                        with open('resources/bow/allfreq/stanford/' + keyword + '_' + emotion.lower() + '.txt', 'w') as f:
                            for item in freq:
                                f.write(str(item) + '\n')
                        f.close()
                        print("writing bow file")
                        top_tokens = [f[1] for f in freq[:500]]
                        lentoptok = len(top_tokens)
                        corpus_bow = {}
                        toplen = 0
                        for i in range(len(corpus_tok)):
                            corpus_bow[i] = [0] * lentoptok
                            if len(corpus_tok[i][0] + corpus_tok[i][1]) > toplen:
                                toplen = len(corpus_tok[i][0] + corpus_tok[i][1])
                            for tok in corpus_tok[i][1]:
                                if tok in top_tokens:
                                    corpus_bow[i][top_tokens.index(tok)] = 1
                        with open('resources/bow/' + keyword + '_' + emotion.lower() + '.csv', mode='w') as file:
                            writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                            writer.writerow([''] * toplen + top_tokens)
                            for i in corpus_bow.keys():
                                writer.writerow(corpus_tok[i][0] + corpus_tok[i][1] + [''] * (
                                    toplen - len(corpus_tok[i][0] + corpus_tok[i][1])) + corpus_bow[i])
                        file.close()
                        print('------------------------------------------------------')
                        print(str(time.time() - start_time) + ' seconds to compute ' + keyword + ' ' + emotion)
    f.close()
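# Hedged toy example of the gensim Phrases step used in analyze(): bigrams that clear the
# min_count/threshold scoring come back joined with '_' and are appended to the original
# tokens, so both the unigrams and the bigram are kept. The sample docs are illustrative.
from gensim.models.phrases import Phrases

docs = [['front', 'desk', 'staff'], ['front', 'desk', 'clerk'], ['front', 'desk']]
bigram = Phrases(docs, min_count=1, threshold=1)
for doc in docs:
    extra = [tok for tok in bigram[doc] if '_' in tok]
    print(doc + extra)  # e.g. ['front', 'desk', 'staff', 'front_desk']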
def build_country_indices():
    db = db_connection()
    db.connect()
    queryexecutor = db_operator(db)
    query = 'select distinct(Country) from masterthesis.reviews;'
    print("retrieving countries of tourists")
    tourcountries = [x[0] for x in queryexecutor.execute(query=query)][1:]
    query = 'select distinct(CountryID) from masterthesis.hotels;'
    print("retrieving countries of hotels")
    hotcountries = [x[0] for x in queryexecutor.execute(query=query)]
    db.disconnect()
    special_countries = list()
    country_to_code = dict()
    # https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes
    country_to_code['Antigua & Barbuda'] = 'ag'
    country_to_code['Bonaire St Eustatius and Saba'] = 'bq'
    country_to_code['Cape Verde'] = 'cv'
    country_to_code['Central Africa Republic'] = 'cf'
    country_to_code['Cocos (K) I.'] = 'cc'
    country_to_code['Curaçao'] = 'cw'
    country_to_code['Democratic Republic of the Congo'] = 'cd'
    country_to_code['East Timor'] = 'tl'
    country_to_code['Equitorial Guinea'] = 'gq'
    country_to_code['France, Metro.'] = 'fr'
    country_to_code['Heard and McDonald Islands'] = 'hm'
    country_to_code['Laos'] = 'la'
    country_to_code['Netherlands Antilles'] = 'nt'
    country_to_code['North Korea'] = 'kp'
    country_to_code['Palestinian Territory'] = 'ps'
    country_to_code['Saint Vincent & Grenadines'] = 'vc'
    country_to_code['São Tomé and PrÃncipe'] = 'st'
    country_to_code['Serbia and Montenegro'] = 'em'
    country_to_code['South Korea'] = 'kr'
    country_to_code['St. Helena'] = 'sh'
    country_to_code['St. Pierre and Miquelon'] = 'pm'
    country_to_code['Svalbard & Jan Mayen'] = 'sj'
    country_to_code['Swaziland'] = 'sz'
    country_to_code['Turks & Caicos Islands'] = 'tc'
    country_to_code['U.K. Virgin Islands'] = 'vg'
    country_to_code['U.S. Virgin Islands'] = 'vi'
    country_to_code['U.S.A.'] = 'us'
    for key in country_to_code.keys():
        special_countries.append(key)
    code_to_country = dict()
    for k, v in country_to_code.items():
        code_to_country[v] = k
    for cont in tourcountries:
        try:
            code = pycountry.countries.search_fuzzy(cont)[0].alpha_2.lower()
        except Exception as e:
            None
        if code not in code_to_country:
            code_to_country[code] = cont
        if cont not in country_to_code:
            country_to_code[cont] = code
    for cont in hotcountries:
        cname = (pycountry.countries.get(alpha_2=cont.upper())).name
        if cont not in code_to_country.keys():
            code_to_country[cont] = cname
            country_to_code[cname] = cont
    print("writing the indices")
    with open('resources/tourist_country_index.csv', mode='w') as file:
        writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for i in range(1, len(list(country_to_code.keys())) + 1):
            writer.writerow([i, list(country_to_code.keys())[i - 1]])
        i += 1
        writer.writerow([i, 'no_country'])
        i += 1
        writer.writerow([i, ''])
    file.close()
    with open('resources/hotel_country_index.csv', mode='w') as file:
        writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for i in range(1, len(list(code_to_country.keys())) + 1):
            writer.writerow([i, list(code_to_country.keys())[i - 1]])
        writer.writerow([i + 1, 'no_country'])
    file.close()
    with open('resources/country_to_code.csv', mode='w') as file:
        writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for key in [x[0] for x in sorted(country_to_code.items(), key=operator.itemgetter(1), reverse=False)]:
            writer.writerow([key, country_to_code[key]])
    file.close()
    with open('resources/code_to_country.csv', mode='w') as file:
        writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for key in [x[0] for x in sorted(code_to_country.items(), key=operator.itemgetter(0), reverse=False)]:
            writer.writerow([key, code_to_country[key]])
    file.close()
    print("writing countries indices over")
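# Hedged illustration of the pycountry calls build_country_indices() relies on:
# search_fuzzy() returns a list of candidate countries (and raises LookupError when nothing
# matches), while get(alpha_2=...) resolves a two-letter code back to its record.
print(pycountry.countries.search_fuzzy('South Korea')[0].alpha_2.lower())  # expected: 'kr'
print(pycountry.countries.get(alpha_2='FR').name)                          # expected: 'France'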
def get_spell_by_time(request, castTime):
    conn = db_connection(request)
    cursor = conn.cursor()
    spell = _get_spell_by_time(cursor, castTime)
    conn.close()
    return spell
def get_current_spells(request):
    conn = db_connection(request)
    cursor = conn.cursor()
    currentSpells = _get_current_spells(cursor)
    conn.close()
    return currentSpells
import mysql.connector
import db

mydb = db.db_connection("localhost", "root", "root", "python_practise")

# mydb = mysql.connector.connect(
#     host="localhost",
#     user="******",
#     passwd="root",
#     database="python_practise"
# )

mycursor = mydb.cursor()
mycursor.execute("SELECT * from customer")

myresult = mycursor.fetchall()
mySingleResult = mycursor.fetchone()  # note: fetchone() after fetchall() returns None, the result set is already consumed

for x in myresult:
    print(x)

print(mySingleResult)
def do(originfile):
    start_time = time.time()
    db = db_connection()
    queryexecutor = db_operator(db)
    keywords = {}
    f = open(originfile, "r")
    for line in f:
        keyword = line[:-1]  # important to have last line with \n
        keywords[keyword] = []
        fs = open("subkeywords_booking/subkeywords_booking_cleaned/" + keyword + ".txt", "r")
        for linesub in fs:
            keywords[keyword].append(linesub[:-1])  # important to have last line with \n
        fs.close()
    f.close()
    # nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)
    # nlp.add_pipe(LanguageDetector(language_detection_function=custom_detection_function), name="language_detector", last=True)
    print("Number of processors: ", mp.cpu_count())
    for emotion in ['Good', 'Bad']:
        print("begin " + emotion)
        for keyword in keywords.keys():
            print(keyword)
            '''f = open('csvs/' + keyword + '_' + emotion.lower() + '.csv', mode='w')
            f.close()
            liketext = 'SELECT ReviewID, Country, ' + emotion + ' from masterthesis.reviews where '
            '''
            subkeywords = keywords[keyword]
            '''for subkey in subkeywords:
                liketext += emotion + " LIKE '%" + subkey + "%' or "
            liketext = liketext[:-4]
            # liketext += " limit 10000;"
            liketext += ";"
            db.connect()
            fields = queryexecutor.execute(query=liketext)
            db.disconnect()
            print("start analyzing sentences")'''
            toaddsents = []
            csv_file = open('csvs/' + keyword + '_' + emotion.lower() + '.csv', mode='w',
                            encoding="utf8", newline='\n')
            csv_file.close()
            csv_file = open('csvs/all_sentences/' + keyword + '_' + emotion.lower() + '.csv', mode='r',
                            encoding="utf8", newline='\n')
            '''row_count = sum(1 for row in reader)
            print("number of sentences: " + str(len(row_count)))'''
            # print("number of reviews: " + str(len(fields)))
            reader = csv.reader(csv_file, delimiter='|', quotechar='"')
            i = 0
            allsents = []
            for row in reader:
                i += 1
                if i % 100 == 0:
                    print(str(i))
                allsents.append(row)
                if i % 1000 == 0:
                    break
                # sent = row[2]
                '''lan = identifier.classify(sent)[0]
                acc = identifier.classify(sent)[1]
                if lan == 'en' and acc >= 0.9:
                    sentan = TextBlob(sent)
                    sent = (sentan.correct()).string
                    sentan = TextBlob(sent)
                    words = word_tokenize(sent)
                    skipsentence = True
                    for word in words:
                        if word.lower() in subkeywords:
                            skipsentence = False
                            break
                    if not skipsentence:
                        # tool = grammar_check.LanguageTool('en-GB')
                        # matches = tool.check(text)
                        sent = sent[0].upper() + sent[0 + 1:]
                        sent = sent.replace(' i ', ' I ')
                        # toaddsents.append([row[0], row[1], sent])
                        # sentan.sentiment_assessments
                        if sentan.polarity > 0.4 and sentan.subjectivity > 0.65:
                            toaddsents.append([row[0], row[1], sent])'''
            csv_file.close()
            print('num of reviews: ' + str(len(allsents)))
            counter = Value('i', 0)
            pool = mp.Pool(
                initializer=init_globals,
                processes=mp.cpu_count() * 2,
                initargs=(counter, ),
            )
            results = pool.map_async(thread_function_row_only, [row for row in allsents]).get()
            pool.close()
            pool.join()
            results = [r for r in results if r != None]
            '''l = chunkIt(allsents, 2)
            ths = []
            threads = list()
            for index in range(2):
                li = l[index]
                x = threading.Thread(target=thread_function, args=(index, li,))
                threads.append(x)
                x.start()
            for index, thread in enumerate(threads):
                thread.join()'''
            '''for rew in fields:
                i += 1
                if i % 10000 == 0:
                    print(str(i))
                text = rew[2]
                text = text.replace('\n', ' ')
                text = text.replace('\t', ' ')
                text = text.replace('\r', ' ')
                text = text.replace('.', '. ')
                text = text.replace(':', ': ')
                text = text.replace(';', '; ')
                text = text.replace(',', ', ')
                lan = identifier.classify(text)[0]
                acc = identifier.classify(text)[1]
                if lan == 'en' and acc >= 0.9:
                    tokenized_text = sent_tokenize(text)
                    for sent in tokenized_text:
                        sentan = TextBlob(sent)
                        sent = (sentan.correct()).string
                        sentan = TextBlob(sent)
                        words = word_tokenize(sent)
                        skipsentence = True
                        for word in words:
                            if word.lower() in subkeywords:
                                skipsentence = False
                                break
                        if not skipsentence:
                            # tool = grammar_check.LanguageTool('en-GB')
                            # matches = tool.check(text)
                            sent = sent[0].upper() + sent[0 + 1:]
                            sent = sent.replace(' i ', ' I ')
                            # sentan.ngrams(3)
                            # sentan.sentiment_assessments
                            if sentan.polarity > 0.4 and sentan.subjectivity > 0.65:
                                toaddsents.append([rew[0], rew[1], sent])
                tokenized_text = sent_tokenize(text)
                for sent in tokenized_text:
                    toaddsents.append([rew[0], rew[1], sent])'''
            print("start writing sentences")
            print("num sents: " + str(len(results)))
            i = 0
            csv_file = open('csvs/' + keyword + '_' + emotion.lower() + '.csv', mode='a',
                            encoding="utf8", newline='\n')
            for sen in results:
                i += 1
                if i % 100000 == 0:
                    print(str(i))
                writer = csv.writer(csv_file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                writer.writerow(sen)
            csv_file.close()
            print("done")
    print("--- %s seconds ---" % (time.time() - start_time))
def db_movie_save(self):
    db = db_connection()
    save_movie_doc(self.title, self.theater, self.room, self.time_offset, db)