Code Example #1
def saveweightedtopspersent(originfile):
    keywords = helper.getKeywords(originfile)
    for emotion in ['Good', 'Bad']:
        print("begin " + emotion)
        for keyword in keywords.keys():
            print(keyword)
            path = 'resources/gensim/noadj/not_cleaned/' + keyword + '_' + emotion.lower(
            ) + '/' + keyword + '_' + emotion.lower()
            try:
                lda = LdaModel.load(path)
                raw_corpus = helper.getRawCorpus(
                    csv_file=open('resources/csvs/' + keyword + '_' +
                                  emotion.lower() + '.csv',
                                  mode='r',
                                  encoding="utf8",
                                  newline='\n'),
                    id_and_country=True,
                    additionaldetails=True)
                stopwords = getStopwords(stopset)
                stwfromtfidf = list(
                    TfidfVectorizer(stop_words='english').get_stop_words())
                stopwords = set(list(stopwords) + stwfromtfidf)
                for w in negationstopset:
                    stopwords.add(w)
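                # full preprocessing of the raw reviews: stopword removal, bigram detection
                # (kept only above the given min count) and conversion to a bag-of-words corpus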
                bow, dictionary, corpus, raw_corpus = documentprocessor.fullpreprocessrawcorpustobow(
                    raw_corpus, stopwords, min_count_bigrams=20)

                if not os.path.exists(
                        'resources/gensim/noadj/outputtopsdocs/'):
                    os.makedirs('resources/gensim/noadj/outputtopsdocs/')
                if not os.path.exists(
                        'resources/gensim/noadj/outputtopsdocs/' + keyword +
                        '_' + emotion.lower() + '/'):
                    os.makedirs('resources/gensim/noadj/outputtopsdocs/' +
                                keyword + '_' + emotion.lower() + '/')
                csv_file = open('resources/gensim/noadj/outputtopsdocs/' +
                                keyword + '_' + emotion.lower() + '/' +
                                keyword + '_' + emotion.lower() + '.csv',
                                mode='w',
                                encoding="utf8",
                                newline='\n')
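                # one row per document: the original review fields, its preprocessed tokens,
                # and the LDA topic distribution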
                writer = csv.writer(csv_file,
                                    delimiter='|',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
                for i, val in enumerate(lda.get_document_topics(bow)):
                    writer.writerow(raw_corpus[i] + [corpus[i], val])
                csv_file.close()
            except Exception as e:
                print(e)
Code Example #2
def dividebynation(originfile):
    keywords = helper.getKeywords(originfile)
    for emotion in ['Good', 'Bad']:
        print("begin " + emotion)
        for keyword in keywords.keys():
            print(keyword)
            nationcluster = {}
            try:
                csv_file = open('resources/gensim/noadj/outputtopsdocs/' +
                                keyword + '_' + emotion.lower() + '/' +
                                keyword + '_' + emotion.lower() + '.csv',
                                mode='r',
                                encoding="utf8",
                                newline='\n')
                reader = csv.reader(csv_file, delimiter='|', quotechar='"')
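                # group rows by the country value in column 1; each group is written to its own CSV below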
                for row in reader:
                    nat = row[1]
                    if nat not in nationcluster.keys():
                        nationcluster[nat] = []
                    nationcluster[nat].append(row)
                for nat in nationcluster.keys():
                    if not os.path.exists(
                            'resources/gensim/noadj/outputtopsdocs/' +
                            keyword + '_' + emotion.lower() + '/bycountry/'):
                        os.makedirs('resources/gensim/noadj/outputtopsdocs/' +
                                    keyword + '_' + emotion.lower() +
                                    '/bycountry/')
                    csv_file = open('resources/gensim/noadj/outputtopsdocs/' +
                                    keyword + '_' + emotion.lower() +
                                    '/bycountry/' + nat + '.csv',
                                    mode='w',
                                    encoding="utf8",
                                    newline='\n')
                    writer = csv.writer(csv_file,
                                        delimiter='|',
                                        quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL)
                    for r in nationcluster[nat]:
                        writer.writerow(r)
                    csv_file.close()
            except Exception:
                pass  # skip keywords whose topic-document CSV is missing or unreadable
Code Example #3
def do(self, originfile, tf, includingkeword, negation):
    keywords = helper.getKeywords(originfile)
    # Create stopword list:
    stopwords = self.getStopwords(self.stopset)
    stwfromtfidf = list(
        TfidfVectorizer(stop_words='english').get_stop_words())
    stopwords = set(list(stopwords) + stwfromtfidf)
    if negation == 'withnegation':
        for w in self.negationstopset:
            stopwords.add(w)
    elif negation == 'nonegation':
        for w in self.negationstopset:
            stopwords.discard(w)  # no error if the word is not in the set
    for emotion in ['Good', 'Bad']:
        print("begin " + emotion)
        for keyword in list(keywords.keys()):
            print(keyword)
            raw_corpus = helper.getRawCorpus(
                csv_file=open('resources/csvs/' + keyword + '_' +
                              emotion.lower() + '.csv',
                              mode='r',
                              encoding="utf8",
                              newline='\n'),
                id_and_country=True)
            raw_corpus, corpus = helper.preprocessRawCorpus(
                raw_corpus, thresholdcountpernation=100)
            self.doKaggle(corpus,
                          stopwords,
                          keyword,
                          emotion,
                          tfname=tf,
                          includingkeywordname=includingkeword,
                          negationname=negation)
            # self.doBasicGensim(originfile,corpus)
            # self.doTWds(originfile,corpus)
            '''try:
Code Example #4
def filterallsep(originfile, toptokens=False, all=False):
    keywords = helper.getKeywords(originfile)
    old_cont_index = indexmanager.get_hotel_country_index()
    old_tok_index = indexmanager.get_token_index()
    if not all:
        lkwds = list(keywords.keys())
        frequencecell = 9
        topnrange = range(10, 51)
        # target_dir depends on the current keyword, so it is set inside the loop below
    else:
        lkwds = ['all']
        frequencecell = 7
        topnrange = range(100, 101)
        res_file = 'resources/bow/tourist_hotel_country_freq/diff/all.csv'
        target_dir = 'resources/bow/tourist_hotel_country_freq/diff/filtered/all_separetely/'
    if toptokens:
        for keyword in lkwds:
            start_time = time.time()
            for topn in topnrange:
                if not all:
                    target_dir = 'resources/bow/tourist_hotel_country_freq/diff/filtered/all_separetely/topntokens/' + keyword + '/'
                    res_file = './resources/bow/tourist_hotel_country_freq/diff/topntokens/' + keyword + '/' + keyword + '_top_' + str(
                        topn) + '_tokens.csv'
                    target_file = 'resources/bow/tourist_hotel_country_freq/diff/filtered/all_separetely/topntokens/' + keyword + '/' + keyword + '_top_' + str(
                        topn) + '_tokens.csv'
                goforward = True
                lines = []
                tokens = set()
                try:
                    with open(res_file) as csv_file:
                        csv_reader = csv.reader(csv_file, delimiter='|')
                        header = next(csv_reader)
                        for row in csv_reader:
                            lines.append([
                                row[0], row[2], row[4], row[5],
                                row[frequencecell]
                            ])
                            tokens.add(row[5])
                    # if len(origins_to_dect[x])>=5
                    # print(len([x for x in origins_to_dect.keys()]))
                    csv_file.close()
                except Exception as e:
                    goforward = False
                if goforward:
                    tokens = list(tokens)
                    token_index = dict()
                    old_tok_to_new = dict()
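                    # remap the surviving old token indices to a compact 1-based index local to this file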
                    for i in range(1, len(tokens) + 1):
                        token_index[i] = old_tok_index['index_to_token'][int(
                            tokens[i - 1])]
                        old_tok_to_new[int(tokens[i - 1])] = i
                    if not os.path.exists(target_dir):
                        os.makedirs(target_dir)
                    with open(target_file, mode='w') as file:
                        writer = csv.writer(file,
                                            delimiter='|',
                                            quotechar='"',
                                            quoting=csv.QUOTE_MINIMAL)
                        writer.writerow([
                            'country_origin_index',
                            'country_destination_index',
                            'number unique reviews', 'token_index',
                            'frequence_difference'
                        ])
                        for line in lines:
                            newline = [
                                int(line[0]),
                                int(line[1]),
                                int(line[2]), old_tok_to_new[int(line[3])],
                                line[4]
                            ]
                            writer.writerow(newline)
                    file.close()
                    if not all:
                        target_file_token_index = 'resources/bow/tourist_hotel_country_freq/diff/filtered/all_separetely/topntokens/' + keyword + '/' + keyword + '_top_' + str(
                            topn) + '_tokens_token_index.csv'
                    with open(target_file_token_index, mode='w') as file:
                        writer = csv.writer(file,
                                            delimiter='|',
                                            quotechar='"',
                                            quoting=csv.QUOTE_MINIMAL)
                        writer.writerow(['token_index', 'token'])
                        for key in token_index.keys():
                            writer.writerow([key, token_index[key]])
                    file.close()
                print('------------------------------------------------------')
                print(
                    str(time.time() - start_time) + ' seconds to filter ' +
                    keyword + ',top ' + str(topn) + ' tokens')
    else:
        for keyword in lkwds:
            try:
                lines = []
                combs = dict()
                origins_to_dect = dict()
                goforward = True
                start_time = time.time()
                tokens = set()
                with open(res_file) as csv_file:
                    csv_reader = csv.reader(csv_file, delimiter='|')
                    header = next(csv_reader)
                    for row in csv_reader:
                        if int(row[4]) >= 0 and row[1] != '' and row[
                                1] != 'no_country' and row[3] != 'no_country':
                            lines.append([
                                row[0], row[2], row[4], row[5],
                                row[frequencecell]
                            ])
                            if (row[0], row[2]) not in combs.keys():
                                combs[(row[0], row[2])] = set()
                            if row[0] not in origins_to_dect.keys():
                                origins_to_dect[row[0]] = set()
                            origins_to_dect[row[0]].add(row[2])
                            combs[(row[0], row[2])].add(row[5])
                            tokens.add(row[5])
                # if len(origins_to_dect[x])>=5
                # print(len([x for x in origins_to_dect.keys()]))
                csv_file.close()
            except Exception as e:
                goforward = False
            if goforward:
                countries = set(origins_to_dect.keys()).union(
                    set.union(*[x for x in origins_to_dect.values()]))
                countries = list(countries)
                country_index = dict()
                old_cont_to_new = dict()
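                # remap old country and token indices to compact 1-based indices covering only the filtered rows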
                for i in range(1, len(countries) + 1):
                    country_index[i] = old_cont_index['index_to_country'][int(
                        countries[i - 1])]
                    old_cont_to_new[int(countries[i - 1])] = i
                tokens = list(tokens)
                token_index = dict()
                old_tok_to_new = dict()
                for i in range(1, len(tokens) + 1):
                    token_index[i] = old_tok_index['index_to_token'][int(
                        tokens[i - 1])]
                    old_tok_to_new[int(tokens[i - 1])] = i
                if not os.path.exists(
                        'resources/bow/tourist_hotel_country_freq/diff/filtered/all_separetely/'
                ):
                    os.makedirs(
                        'resources/bow/tourist_hotel_country_freq/diff/filtered/all_separetely/'
                    )
                print("starting writing csv")
                with open(
                        'resources/bow/tourist_hotel_country_freq/diff/filtered/all_separetely/'
                        + keyword + '.csv',
                        mode='w') as file:
                    writer = csv.writer(file,
                                        delimiter='|',
                                        quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL)
                    writer.writerow([
                        'country_origin_index', 'country_destination_index',
                        'number unique reviews', 'token_index',
                        'frequence_difference'
                    ])
                    print("len lines= " + str(len(lines)))
                    i = 0
                    for line in lines:
                        i += 1
                        if i % 100000 == 0:
                            print(i)
                        newline = [
                            old_cont_to_new[int(line[0])],
                            old_cont_to_new[int(line[1])],
                            int(line[2]), old_tok_to_new[int(line[3])], line[4]
                        ]
                        writer.writerow(newline)
                file.close()
                print("starting writing country index")
                with open(
                        'resources/bow/tourist_hotel_country_freq/diff/filtered/all_separetely/'
                        + keyword + '_country_index.csv',
                        mode='w') as file:
                    writer = csv.writer(file,
                                        delimiter='|',
                                        quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL)
                    writer.writerow(['country_index', 'country'])
                    for key in country_index.keys():
                        writer.writerow([key, country_index[key]])
                file.close()
                print("starting writing token index")
                with open(
                        'resources/bow/tourist_hotel_country_freq/diff/filtered/all_separetely/'
                        + keyword + '_token_index.csv',
                        mode='w') as file:
                    writer = csv.writer(file,
                                        delimiter='|',
                                        quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL)
                    writer.writerow(['token_index', 'token'])
                    for key in token_index.keys():
                        writer.writerow([key, token_index[key]])
                file.close()
            print('------------------------------------------------------')
            print(
                str(time.time() - start_time) + ' seconds to filter ' +
                keyword)
Code Example #5
def build_association_count_list(originfile):
    lines = []
    lines_reduced = []
    ass = dict()
    keywords = helper.getKeywords(originfile)
    combs = dict()
    ass_reduced = dict()
    combs_reduced = dict()
    for keyword in list(keywords.keys()):
        if keyword in ['breakfast', 'bedroom', 'bathroom', 'location']:
            ass[keyword] = dict()
            combs[keyword] = dict()
            ass_reduced[keyword] = dict()
            combs_reduced[keyword] = dict()
            with open(
                    'resources/bow/tourist_hotel_country_freq/diff/filtered/withcomb/'
                    + keyword + '.csv') as csv_file:
                csv_reader = csv.reader(csv_file, delimiter='|')
                next(csv_reader)
                for row in csv_reader:
                    if row[0] not in ass[keyword].keys():
                        ass[keyword][row[0]] = set()
                    ass[keyword][row[0]].add(row[1])
                    if (row[0], row[1]) not in combs[keyword].keys():
                        combs[keyword][(row[0], row[1])] = set()
                    combs[keyword][(row[0], row[1])].add(row[2])
            csv_file.close()
            with open(
                    'resources/bow/tourist_hotel_country_freq/diff/filtered/withcomb/reduced/'
                    + keyword + '.csv') as csv_file:
                csv_reader = csv.reader(csv_file, delimiter='|')
                next(csv_reader)
                for row in csv_reader:
                    if row[0] not in ass_reduced[keyword].keys():
                        ass_reduced[keyword][row[0]] = set()
                    ass_reduced[keyword][row[0]].add(row[1])
                    if (row[0], row[1]) not in combs_reduced[keyword].keys():
                        combs_reduced[keyword][(row[0], row[1])] = set()
                    combs_reduced[keyword][(row[0], row[1])].add(row[2])
            csv_file.close()
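    # sanity checks: verify that origins, destinations and per-combination token sets match across the four concepts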
    b = True
    v = set([[k for k in ass[keyword].keys()] for keyword in ass.keys()][0])
    for o in [[k for k in ass[keyword].keys()] for keyword in ass.keys()]:
        if set(o) != v:
            b = False
            break
    lines.append('all possible origins are the same number and the same = ' +
                 str(b) + '\n')
    v = [[ass[keyword][k] for k in ass[keyword].keys()]
         for keyword in ass.keys()][0][0]
    b = True
    for d in [[ass[keyword][k] for k in ass[keyword].keys()]
              for keyword in ass.keys()]:
        for dd in d:
            if dd != v:
                b = False
                break
    lines.append(
        'all possible destinations are the same number and the same = ' +
        str(b) + '\n')
    lines.append('all origins are: ' +
                 str(set([k for k in ass['breakfast'].keys()])) + '\n')
    lines.append('all destinations are: ' +
                 str([ass['breakfast'][k]
                      for k in ass['breakfast'].keys()][0]) + '\n')
    for keyword in ['breakfast', 'bedroom', 'bathroom', 'location']:
        b = True
        toksetz = [combs[keyword][c] for c in combs[keyword].keys()][0]
        for tokset in [combs[keyword][c] for c in combs[keyword].keys()]:
            if tokset != toksetz:
                b = False
                break
        lines.append(
            "for concept " + keyword +
            ', for every combination origin/destination, all the tokens are the same = '
            + str(b) + '\n')
        lines.append(
            "for concept " + keyword +
            ' the length of the list of tokens for the first combination origin/destination is '
            + str(len(toksetz)) + '\n')

    b = True
    v = set([[k for k in ass_reduced[keyword].keys()]
             for keyword in ass_reduced.keys()][0])
    for o in [[k for k in ass_reduced[keyword].keys()]
              for keyword in ass_reduced.keys()]:
        if set(o) != v:
            b = False
            break
    lines_reduced.append(
        'all possible origins are the same number and the same = ' + str(b) +
        '\n')
    v = [[ass_reduced[keyword][k] for k in ass_reduced[keyword].keys()]
         for keyword in ass_reduced.keys()][0][0]
    b = True
    for d in [[ass_reduced[keyword][k] for k in ass_reduced[keyword].keys()]
              for keyword in ass_reduced.keys()]:
        for dd in d:
            if dd != v:
                b = False
                break
    lines_reduced.append(
        'all possible destinations are the same number and the same = ' +
        str(b) + '\n')
    lines_reduced.append('all origins are: ' +
                         str(set([k
                                  for k in ass_reduced['breakfast'].keys()])) +
                         '\n')
    lines_reduced.append('all destinations are: ' + str(
        [ass_reduced['breakfast'][k]
         for k in ass_reduced['breakfast'].keys()][0]) + '\n')
    for keyword in ['breakfast', 'bedroom', 'bathroom', 'location']:
        b = True
        toksetz = [
            combs_reduced[keyword][c] for c in combs_reduced[keyword].keys()
        ][0]
        for tokset in [
                combs_reduced[keyword][c]
                for c in combs_reduced[keyword].keys()
        ]:
            if tokset != toksetz:
                b = False
                break
        lines_reduced.append(
            "for concept " + keyword +
            ', for every combination origin/destination, all the tokens are the same = '
            + str(b) + '\n')
        lines_reduced.append(
            "for concept " + keyword +
            ' the length of the list of tokens for the first combination origin/destination is '
            + str(len(toksetz)) + '\n')

    file = open(
        'resources/bow/tourist_hotel_country_freq/diff/filtered/withcomb/report.txt',
        'w')
    file.writelines(lines)
    file.close()
    file = open(
        'resources/bow/tourist_hotel_country_freq/diff/filtered/withcomb/reduced/report.txt',
        'w')
    file.writelines(lines_reduced)
    file.close()
Code Example #6
def filter(originfile):
    keywords = helper.getKeywords(originfile)
    countries = dict()
    tokens = dict()
    countries['origin'] = dict()
    countries['destination'] = dict()
    lines_dict = dict()
    lines_reduced_dict = dict()
    intersect_tokens = set()
    intersect_countries_origin = set()
    intersect_countries_dest = set()
    validkeywords = []
    ass_count_count = dict()
    ass_count_count['origin_tourist'] = dict()
    ass_count_count['destination_hotel'] = dict()
    k_values = dict()
    k_values['breakfast'] = 6
    k_values['bedroom'] = 5
    k_values['bathroom'] = 4
    k_values['location'] = 13
    for keyword in list(keywords.keys()):
        if keyword in ['breakfast', 'bedroom', 'bathroom', 'location']:
            ass_count_count['origin_tourist'][keyword] = dict()
            ass_count_count['destination_hotel'][keyword] = dict()
            countries['origin'][keyword] = set()
            countries['destination'][keyword] = set()
            tokens[keyword] = set()
            start_time = time.time()
            goforward = True
            print(keyword + ' ---- ' +
                  time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            lines = []
            lines_reduced = []
            try:
                with open('resources/bow/tourist_hotel_country_freq/diff/' +
                          keyword + '.csv') as csv_file:
                    csv_reader = csv.reader(csv_file, delimiter='|')
                    next(csv_reader)
                    for row in csv_reader:
                        if int(row[4]) >= 100 and row[1] != '' and row[
                                1] != 'no_country' and row[3] != 'no_country':
                            lines.append([row[0], row[2], row[5], row[9]])
                            countries['origin'][keyword].add(row[0])
                            countries['destination'][keyword].add(row[2])
                            tokens[keyword].add(row[5])
                        if int(row[4]) >= 20 and row[1] != '' and row[
                                1] != 'no_country' and row[3] != 'no_country':
                            lines_reduced.append(
                                [row[0], row[2], row[5], row[9]])
                csv_file.close()
            except Exception as e:
                goforward = False
            if goforward:
                validkeywords.append(keyword)
                lines_dict[keyword] = lines
                lines_reduced_dict[keyword] = lines_reduced
                if len(list(intersect_tokens)) == 0:
                    intersect_tokens = tokens[keyword]
                if len(list(intersect_countries_origin)) == 0:
                    intersect_countries_origin = countries['origin'][keyword]
                if len(list(intersect_countries_dest)) == 0:
                    intersect_countries_dest = countries['destination'][
                        keyword]
                intersect_tokens = intersect_tokens.intersection(
                    tokens[keyword])
                intersect_countries_origin = intersect_countries_origin.intersection(
                    countries['origin'][keyword])
                intersect_countries_dest = intersect_countries_dest.intersection(
                    countries['destination'][keyword])

                ass_sep = dict()
                for line in lines:
                    if line[0] not in ass_sep.keys():
                        ass_sep[line[0]] = set()
                    ass_sep[line[0]].add(line[1])

                k = k_values[keyword]
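                # destinations shared by every origin that has at least k destinations, then origins
                # whose destination set covers all of them ('>=' below is a set-superset test)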
                destinations_sep = set.intersection(*[
                    ass_sep[key] for key in ass_sep.keys()
                    if len(ass_sep[key]) >= k
                ])
                origins_sep = set([
                    key for key in ass_sep.keys()
                    if ass_sep[key] >= (destinations_sep)
                ])
                newdestinations_sep = set.intersection(
                    *[ass_sep[k] for k in origins_sep])
                if not os.path.exists(
                        'resources/bow/tourist_hotel_country_freq/diff/filtered/withcomb/concept_separetely/'
                ):
                    os.makedirs(
                        'resources/bow/tourist_hotel_country_freq/diff/filtered/withcomb/concept_separetely/'
                    )
                with open(
                        'resources/bow/tourist_hotel_country_freq/diff/filtered/withcomb/concept_separetely/'
                        + keyword + '.csv',
                        mode='w') as file:
                    writer = csv.writer(file,
                                        delimiter='|',
                                        quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL)
                    writer.writerow([
                        'country_origin_index', 'country_destination_index',
                        'token_index', 'frequence_difference'
                    ])
                    for line in lines_dict[keyword]:
                        if line[0] in origins_sep and line[
                                1] in newdestinations_sep:
                            writer.writerow(line)
                file.close()
            print('------------------------------------------------------')
            print(
                str(time.time() - start_time) + ' seconds to filter ' +
                keyword)
    if not os.path.exists(
            'resources/bow/tourist_hotel_country_freq/diff/filtered/'):
        os.makedirs('resources/bow/tourist_hotel_country_freq/diff/filtered/')
    if not os.path.exists(
            'resources/bow/tourist_hotel_country_freq/diff/filtered/withcomb/'
    ):
        os.makedirs(
            'resources/bow/tourist_hotel_country_freq/diff/filtered/withcomb/')
    if not os.path.exists(
            'resources/bow/tourist_hotel_country_freq/diff/filtered/withcomb/reduced/'
    ):
        os.makedirs(
            'resources/bow/tourist_hotel_country_freq/diff/filtered/withcomb/reduced/'
        )
    lines = [[line for line in lines_dict[keyword]]
             for keyword in validkeywords]
    lines_reduced = [[line for line in lines_reduced_dict[keyword]]
                     for keyword in validkeywords]
    ass = dict()
    ass_reduced = dict()
    tokens = set()
    for line in lines:
        ass[lines.index(line)] = dict()
        for l in line:
            if l[0] not in ass[lines.index(line)].keys():
                ass[lines.index(line)][l[0]] = set()
            ass[lines.index(line)][l[0]].add(l[1])
    tokens_reduced = set()
    for line in lines_reduced:
        ass_reduced[lines_reduced.index(line)] = dict()
        for l in line:
            if l[0] not in ass_reduced[lines_reduced.index(line)].keys():
                ass_reduced[lines_reduced.index(line)][l[0]] = set()
            ass_reduced[lines_reduced.index(line)][l[0]].add(l[1])
    k = 7
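    # same filtering as above, but intersected across all concepts (the keys of 'ass' are list indices)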
    destinations = set.intersection(*[
        set.intersection(*[
            ass[keyword][key] for key in ass[keyword].keys()
            if len(ass[keyword][key]) >= k
        ]) for keyword in ass.keys()
    ])
    origins = set.intersection(*[
        set([
            key for key in ass[keyword].keys()
            if ass[keyword][key] >= (destinations)
        ]) for keyword in ass.keys()
    ])
    newdestinations = set.intersection(*[
        set.intersection(*[ass[keyword][k] for k in origins])
        for keyword in ass.keys()
    ])
    for keyword in lines_dict.keys():
        for line in lines_dict[keyword]:
            if line[0] in origins and line[1] in newdestinations:
                tokens.add(line[2])
    k = 12
    destinations_reduced = set.intersection(*[
        set.intersection(*[
            ass_reduced[keyword][key] for key in ass_reduced[keyword].keys()
            if len(ass_reduced[keyword][key]) >= k
        ]) for keyword in ass_reduced.keys()
    ])
    origins_reduced = set.intersection(*[
        set([
            key for key in ass_reduced[keyword].keys()
            if ass_reduced[keyword][key] >= (destinations_reduced)
        ]) for keyword in ass_reduced.keys()
    ])
    newdestinations_reduced = set.intersection(*[
        set.intersection(*[ass_reduced[keyword][k] for k in origins_reduced])
        for keyword in ass_reduced.keys()
    ])
    for keyword in lines_reduced_dict.keys():
        for line in lines_reduced_dict[keyword]:
            if line[0] in origins_reduced and line[
                    1] in newdestinations_reduced:
                tokens_reduced.add(line[2])
    token_index = dict()
    country_index = dict()
    token_index_reduced = dict()
    country_index_reduced = dict()
    old_cont_index = indexmanager.get_hotel_country_index()
    old_tok_index = indexmanager.get_token_index()
    country_list = list(newdestinations.union(origins))
    old_cont_to_new = dict()
    old_tok_to_new = dict()
    old_cont_to_new_reduced = dict()
    old_tok_to_new_reduced = dict()
    tokenlist = list(tokens)
    tokenlist_reduced = list(tokens_reduced)
    country_list_reduced = list(newdestinations_reduced.union(origins_reduced))
    for i in range(1, len(country_list) + 1):
        country_index[i] = old_cont_index['index_to_country'][int(
            country_list[i - 1])]
        old_cont_to_new[int(country_list[i - 1])] = i
    for i in range(1, len(tokenlist) + 1):
        token_index[i] = old_tok_index['index_to_token'][int(tokenlist[i - 1])]
        old_tok_to_new[int(tokenlist[i - 1])] = i
    for i in range(1, len(country_list_reduced) + 1):
        country_index_reduced[i] = old_cont_index['index_to_country'][int(
            country_list_reduced[i - 1])]
        old_cont_to_new_reduced[int(country_list_reduced[i - 1])] = i
    for i in range(1, len(tokenlist_reduced) + 1):
        token_index_reduced[i] = old_tok_index['index_to_token'][int(
            tokenlist_reduced[i - 1])]
        old_tok_to_new_reduced[int(tokenlist_reduced[i - 1])] = i
    with open(
            'resources/bow/tourist_hotel_country_freq/diff/filtered/withcomb/token_index.csv',
            mode='w') as file:
        writer = csv.writer(file,
                            delimiter='|',
                            quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        for k in sorted(list(token_index.keys())):
            writer.writerow([k, token_index[k]])
    file.close()
    with open(
            'resources/bow/tourist_hotel_country_freq/diff/filtered/withcomb/country_index.csv',
            mode='w') as file:
        writer = csv.writer(file,
                            delimiter='|',
                            quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        for k in sorted(list(country_index.keys())):
            writer.writerow([k, country_index[k]])
    file.close()
    for keyword in validkeywords:
        with open('resources/bow/tourist_hotel_country_freq/diff/filtered/' +
                  keyword + '.csv',
                  mode='w') as file:
            writer = csv.writer(file,
                                delimiter='|',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
            writer.writerow([
                'country_origin_index', 'country_destination_index',
                'token_index', 'frequence_difference'
            ])
            for line in lines_dict[keyword]:
                if line[0] in intersect_countries_origin and line[
                        1] in intersect_countries_dest:
                    writer.writerow(line)
        file.close()
        with open(
                'resources/bow/tourist_hotel_country_freq/diff/filtered/withcomb/'
                + keyword + '.csv',
                mode='w') as file:
            writer = csv.writer(file,
                                delimiter='|',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
            writer.writerow([
                'country_origin_index', 'country_destination_index',
                'token_index', 'frequence_difference'
            ])
            for line in lines_dict[keyword]:
                if line[0] in origins and line[1] in newdestinations:
                    newline = [
                        old_cont_to_new[int(line[0])],
                        old_cont_to_new[int(line[1])],
                        old_tok_to_new[int(line[2])], line[3]
                    ]
                    writer.writerow(newline)
        with open(
                'resources/bow/tourist_hotel_country_freq/diff/filtered/withcomb/reduced/'
                + keyword + '.csv',
                mode='w') as file:
            writer = csv.writer(file,
                                delimiter='|',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
            writer.writerow([
                'country_origin_index', 'country_destination_index',
                'token_index', 'frequence_difference'
            ])
            for line in lines_reduced_dict[keyword]:
                if line[0] in origins_reduced and line[
                        1] in newdestinations_reduced:
                    writer.writerow(line)
        file.close()
Code Example #7
def do(originfile, all=False, common_tokens=True):
    if all:
        tokenset = set()
        alltable = read_table_all(
            'resources/bow/tourist_hotel_country_freq/all.csv')
        diff_table = {}
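        # build the difference table: for each (tourist country, hotel country) pair keep
        # the per-token values and the number of unique reviews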
        for countries in alltable.keys():
            diff_table[countries] = {}
            diff_table[countries]['tokens'] = {}
            diff_table[countries]['unique_reviews'] = alltable[countries][
                'unique_reviews']
            diff_table[countries]['count_rev'] = len(
                list(diff_table[countries]['unique_reviews']))
            for tok in alltable[countries]['tokens'].keys():
                tokenset.add(tok)
                diff_table[countries]['tokens'][tok] = {}
                diff_table[countries]['tokens'][tok]['diff'] = alltable[
                    countries]['tokens'][tok]
        indexmanager.update_token_index(tokenset)
        print("start writing difference matrix for all matrix")
        country_ind = indexmanager.get_hotel_country_index()
        if not os.path.exists(
                'resources/bow/tourist_hotel_country_freq/diff/'):
            os.makedirs('resources/bow/tourist_hotel_country_freq/diff/')
        with open('resources/bow/tourist_hotel_country_freq/diff/all.csv',
                  mode='w') as file:
            writer = csv.writer(file,
                                delimiter='|',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
            writer.writerow([
                'Tourist_Country_Index', 'Tourist_Country',
                'Hotel_Country_Index', 'Hotel_Country',
                'Total number of unique reviews', 'Token_Index', 'Token',
                'Token_Frequence'
            ])
            token_index = indexmanager.get_token_index()
            print("num_comb_countries= " + str(len(diff_table.keys())))
            i = 0
            for countries in diff_table.keys():
                i += 1
                if i % 1000 == 0:
                    print(
                        str(i) + '  ' +
                        str(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())))
                for tok in diff_table[countries]['tokens'].keys():
                    writer.writerow([
                        country_ind['country_to_index'][countries[0]],
                        countries[0],
                        country_ind['country_to_index'][countries[1]],
                        countries[1], diff_table[countries]['count_rev'],
                        token_index['token_to_index'][tok], tok,
                        "{:.15f}".format(
                            diff_table[countries]['tokens'][tok]['diff'])
                    ])

        file.close()
        print("over. written difference file")
    else:
        keywords = helper.getKeywords(originfile)
        tokenset = set()
        diff_tables = {}
        diff_tables_topntokens = {}
        validkeywords = []
        for keyword in keywords.keys():
            start_time = time.time()
            goforward = True
            print(keyword + ' ---- ' +
                  time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            try:
                good_tab = read_table(
                    'resources/bow/tourist_hotel_country_freq/' + keyword +
                    '_good.csv')
                bad_table = read_table(
                    'resources/bow/tourist_hotel_country_freq/' + keyword +
                    '_bad.csv')
            except Exception:
                goforward = False
            if goforward:
                validkeywords.append(keyword)
                #diff_table = get_diff_table(good_tab, bad_table, tokenset, common_tokens=common_tokens)
                diff_tables_topntokens[keyword] = {}
                #diff_tables[keyword] = diff_table
                for topntokens in range(10, 51):
                    diff_table_topntokens = get_diff_table(
                        good_tab,
                        bad_table,
                        tokenset,
                        common_tokens=True,
                        topntokens=topntokens)
                    diff_tables_topntokens[keyword][
                        topntokens] = diff_table_topntokens
            print('------------------------------------------------------')
            print(
                str(time.time() - start_time) +
                ' seconds to build the difference table for ' + keyword)
        print("start writing difference matrices")
        #indexmanager.build_token_index(tokenset)
        token_index = indexmanager.get_token_index()
        if not os.path.exists(
                'resources/bow/tourist_hotel_country_freq/diff/topntokens/'):
            os.makedirs(
                'resources/bow/tourist_hotel_country_freq/diff/topntokens/')
        for keyword in validkeywords:
            start_time = time.time()
            print(keyword + ' ---- ' +
                  time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            #country_tourist_ind = indexmanager.get_tourist_country_index()
            country_ind = indexmanager.get_hotel_country_index()
            if not os.path.exists(
                    'resources/bow/tourist_hotel_country_freq/diff/'):
                os.makedirs('resources/bow/tourist_hotel_country_freq/diff/')
            if not os.path.exists(
                    'resources/bow/tourist_hotel_country_freq/diff/topntokens/'
                    + keyword + '/'):
                os.makedirs(
                    'resources/bow/tourist_hotel_country_freq/diff/topntokens/'
                    + keyword + '/')
            '''with open('resources/bow/tourist_hotel_country_freq/diff/' + keyword + '.csv',
                      mode='w') as file:
                writer = csv.writer(file, delimiter='|', quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
                writer.writerow(['Tourist_Country_Index', 'Tourist_Country', 'Hotel_Country_Index', 'Hotel_Country',
                                 'Total number of unique reviews', 'Token_Index', 'Token', 'Token_Frequence_in_Good',
                                 'Token_Frequence_in_Bad', 'Difference'])
                for countries in diff_tables[keyword].keys():
                    for tok in diff_tables[keyword][countries]['tokens'].keys():
                        goodval=diff_tables[keyword][countries]['tokens'][tok]['good']
                        if goodval!='N/A':
                            goodval="{:.15f}".format(goodval)
                        badval = diff_tables[keyword][countries]['tokens'][tok]['bad']
                        if badval != 'N/A':
                            badval = "{:.15f}".format(badval)
                        writer.writerow([country_ind['country_to_index'][countries[0]], countries[0],
                                         country_ind['country_to_index'][countries[1]], countries[1],
                                         diff_tables[keyword][countries]['count_rev'], token_index['token_to_index'][tok], tok,
                                         goodval,
                                         badval,
                                         "{:.15f}".format(diff_tables[keyword][countries]['tokens'][tok]['diff'])])

            file.close()'''
            for topntokens in diff_tables_topntokens[keyword].keys():

                with open(
                        'resources/bow/tourist_hotel_country_freq/diff/topntokens/'
                        + keyword + '/' + keyword + '_top_' + str(topntokens) +
                        '_tokens.csv',
                        mode='w') as file:
                    writer = csv.writer(file,
                                        delimiter='|',
                                        quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL)
                    writer.writerow([
                        'Tourist_Country_Index', 'Tourist_Country',
                        'Hotel_Country_Index', 'Hotel_Country',
                        'Total number of unique reviews', 'Token_Index',
                        'Token', 'Token_Frequence_in_Good',
                        'Token_Frequence_in_Bad', 'Difference'
                    ])
                    for countries in diff_tables_topntokens[keyword][
                            topntokens].keys():
                        for tok in diff_tables_topntokens[keyword][topntokens][
                                countries]['tokens'].keys():
                            goodval = diff_tables_topntokens[keyword][
                                topntokens][countries]['tokens'][tok]['good']
                            if goodval != 'N/A':
                                goodval = "{:.15f}".format(goodval)
                            badval = diff_tables_topntokens[keyword][
                                topntokens][countries]['tokens'][tok]['bad']
                            if badval != 'N/A':
                                badval = "{:.15f}".format(badval)
                            writer.writerow([
                                country_ind['country_to_index'][countries[0]],
                                countries[0],
                                country_ind['country_to_index'][countries[1]],
                                countries[1], diff_tables_topntokens[keyword]
                                [topntokens][countries]['count_rev'],
                                token_index['token_to_index'][tok], tok,
                                goodval, badval, "{:.15f}".format(
                                    diff_tables_topntokens[keyword][topntokens]
                                    [countries]['tokens'][tok]['diff'])
                            ])

                file.close()

            print(
                str(time.time() - start_time) +
                ' seconds to write the difference matrix for ' + keyword)
Code Example #8
import csv
import os
import time

import db
import helper

conn = db.db_connection()
conn.connect()
dbo = db.db_operator(conn)
keywords = helper.getKeywords('booking_keywords.txt')
diff_tables = {}
validkeywords = []
cd = os.getcwd()
'''
            query = 'CREATE TABLE masterthesis.' + keyword + '_diff_filtered_intersection_only ' + \
                    '(Country_of_origin VARCHAR(45) NOT NULL, Country_of_destination VARCHAR(45) NOT NULL, ' \
                    'Token_index SMALLINT NOT NULL, Frequence_difference VARCHAR(45),' \
                    ' PRIMARY KEY (Country_of_origin, Country_of_destination,Token_index));'
            dbo.execute(query)
            #with open('resources/bow/tourist_hotel_country_freq/diff/filtered' + keyword + '_'+emotion+'.csv') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter='|')
            firstrow = next(csv_reader)
            csv_file.close()
            firstrow=firstrow[3:]'''
'''query='CREATE TABLE masterthesis.'+keyword+'_diff_filtered_intersection_only '+ \
      '(Country_of_origin VARCHAR(45) NOT NULL, Country_of_destination VARCHAR(45) NOT NULL, ' \
      'Token_index SMALLINT NOT NULL, Frequence_difference VARCHAR(45),' \
      ' PRIMARY KEY (Country_of_origin, Country_of_destination,Token_index));'
'''
'''for field in firstrow:
Code Example #9
def do(originfile):
    keywords = helper.getKeywords(originfile)
    for emotion in ['Good', 'Bad']:
        print("begin " + emotion)
        for keyword in keywords.keys():
            start_time = time.time()
            print(keyword)
            raw_corpus = helper.getRawCorpus(csv_file=open(
                'resources/csvs/' + keyword + '_' + emotion.lower() + '.csv',
                mode='r',
                encoding="utf8",
                newline='\n'),
                                             id_and_country=True)
            print("starting preprocessing")
            stopwords = getStopwords(stopset)
            stwfromtfidf = list(
                TfidfVectorizer(stop_words='english').get_stop_words())
            stopwords = set(list(stopwords) + stwfromtfidf)
            for w in negationstopset:
                stopwords.add(w)
            bow, dictionary, corpus, raw_corpus = documentprocessor.fullpreprocessrawcorpustobow(
                raw_corpus, stopwords, min_count_bigrams=20)

            ###############################################################################
            # Let's see how many tokens and documents we have to train on.
            #

            print('Number of unique tokens: %d' % len(dictionary))
            print('Number of documents: %d' % len(bow))

            ###############################################################################
            # Training
            # --------
            #
            # We are ready to train the LDA model. We will first discuss how to set some of
            # the training parameters.
            #
            # First of all, the elephant in the room: how many topics do I need? There is
            # really no easy answer for this; it will depend on both your data and your
            # application. I have used 10 topics here because I wanted to have a few topics
            # that I could interpret and "label", and because that turned out to give me
            # reasonably good results. You might not need to interpret all your topics, so
            # you could use a large number of topics, for example 100.
            #
            # ``chunksize`` controls how many documents are processed at a time in the
            # training algorithm. Increasing chunksize will speed up training, at least as
            # long as the chunk of documents easily fits into memory. I've set ``chunksize =
            # 2000``, which is more than the number of documents, so I process all the
            # data in one go. Chunksize can however influence the quality of the model, as
            # discussed in Hoffman and co-authors [2], but the difference was not
            # substantial in this case.
            #
            # ``passes`` controls how often we train the model on the entire corpus.
            # Another word for passes might be "epochs". ``iterations`` is somewhat
            # technical, but essentially it controls how often we repeat a particular loop
            # over each document. It is important to set the number of "passes" and
            # "iterations" high enough.
            #
            # I suggest the following way to choose iterations and passes. First, enable
            # logging (as described in many Gensim tutorials), and set ``eval_every = 1``
            # in ``LdaModel``. When training the model look for a line in the log that
            # looks something like this::
            #
            #    2016-06-21 15:40:06,753 - gensim.models.ldamodel - DEBUG - 68/1566 documents converged within 400 iterations
            #
            # If you set ``passes = 20`` you will see this line 20 times. Make sure that by
            # the final passes, most of the documents have converged. So you want to choose
            # both passes and iterations to be high enough for this to happen.
            #
            # We set ``alpha = 'auto'`` and ``eta = 'auto'``. Again this is somewhat
            # technical, but essentially we are automatically learning two parameters in
            # the model that we usually would have to specify explicitly.
            #
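            # (Illustrative sketch, not part of the original script.) To see the
            # convergence lines mentioned above, one could enable logging and set
            # eval_every = 1 before training, e.g.:
            #   import logging
            #   logging.basicConfig(
            #       format='%(asctime)s : %(levelname)s : %(message)s',
            #       level=logging.INFO)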

            # Train LDA model.
            from gensim.models import LdaModel

            bestacc = -1
            bestmodel = None
            if len(bow) > 0:
                print(
                    "starting training and checking with different number of topics"
                )
                for numt in range(2, 21):

                    # Set training parameters.
                    num_topics = numt
                    chunksize = 2000
                    passes = 20
                    iterations = 400
                    eval_every = None  # Don't evaluate model perplexity, takes too much time.

                    # Make an index-to-word dictionary.
                    temp = dictionary[
                        0]  # This is only to "load" the dictionary.
                    id2word = dictionary.id2token

                    model = LdaModel(corpus=bow,
                                     id2word=id2word,
                                     chunksize=chunksize,
                                     alpha='auto',
                                     eta='auto',
                                     iterations=iterations,
                                     num_topics=num_topics,
                                     passes=passes,
                                     eval_every=eval_every)

                    ###############################################################################
                    # We can compute the topic coherence of each topic. Below we display the
                    # average topic coherence and print the topics in order of topic coherence.
                    #
                    # Note that we use the "Umass" topic coherence measure here (see
                    # :py:func:`gensim.models.ldamodel.LdaModel.top_topics`), Gensim has recently
                    # obtained an implementation of the "AKSW" topic coherence measure (see
                    # accompanying blog post, http://rare-technologies.com/what-is-topic-coherence/).
                    #
                    # If you are familiar with the subject of the articles in this dataset, you can
                    # see that the topics below make a lot of sense. However, they are not without
                    # flaws. We can see that there is substantial overlap between some topics,
                    # others are hard to interpret, and most of them have at least some terms that
                    # seem out of place. If you were able to do better, feel free to share your
                    # methods on the blog at http://rare-technologies.com/lda-training-tips/ !
                    #

                    top_topics = model.top_topics(bow)  # , num_words=20)
                    acc = computetopacc(top_topics)
                    if acc > bestacc:
                        print("found better model with number of topics: " +
                              str(model.num_topics))
                        bestacc = acc
                        bestmodel = copy.deepcopy(model)
                    # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
                    avg_topic_coherence = sum([t[1] for t in top_topics
                                               ]) / num_topics
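                    # 'cc' is assumed to be a module-level list collecting coherence scores; it is not defined in this snippet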
                    cc.append(avg_topic_coherence)
                    print('Average topic coherence: %.4f.' %
                          avg_topic_coherence)
                savemodel(bestmodel, keyword, emotion, bow)
                print(
                    str(time.time() - start_time) + ' seconds to compute ' +
                    keyword + ' ' + emotion)
Code Example #10
def do(originfile):
    keywords = helper.getKeywords(originfile)
    for emotion in ['Good', 'Bad']:
        print("begin " + emotion)
        for keyword in list(keywords.keys()):
            start_time = time.time()
            goforward = True
            print(keyword + ' ---- ' +
                  time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            try:
                with open('resources/bow/' + keyword + '_' + emotion.lower() +
                          '.csv') as csv_file:
                    csv_reader = csv.reader(csv_file, delimiter='|')
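                    # cluster() returns the token list plus per-country review counts and relative
                    # token frequencies, grouped by hotel country and by tourist country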
                    tokens, country_cluster_hotel, country_cluster_tourist = cluster(
                        csv_reader)
                csv_file.close()
            except Exception:
                goforward = False
            if goforward:
                if not os.path.exists(
                        'resources/bow/country_freq/byhotelcountry/'):
                    os.makedirs('resources/bow/country_freq/byhotelcountry/')
                if not os.path.exists(
                        'resources/bow/country_freq/bytouristcountry/'):
                    os.makedirs('resources/bow/country_freq/bytouristcountry/')
                with open('resources/bow/country_freq/byhotelcountry/' +
                          keyword + '_' + emotion.lower() + '.csv',
                          mode='w') as file:
                    writer = csv.writer(file,
                                        delimiter='|',
                                        quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL)
                    writer.writerow([''] * 2 + tokens)
                    for country in country_cluster_hotel.keys():
                        writer.writerow(
                            [country] +
                            [country_cluster_hotel[country]['count_rev']] +
                            list(
                                map("{:.15f}".format,
                                    country_cluster_hotel[country]
                                    ['rel_freq'])))
                file.close()
                with open('resources/bow/country_freq/bytouristcountry/' +
                          keyword + '_' + emotion.lower() + '.csv',
                          mode='w') as file:
                    writer = csv.writer(file,
                                        delimiter='|',
                                        quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL)
                    writer.writerow([''] * 2 + tokens)
                    for country in country_cluster_tourist.keys():
                        writer.writerow([country] + [
                            country_cluster_tourist[country]['count_rev']
                        ] + list(
                            map("{:.15f}".format,
                                country_cluster_tourist[country]['rel_freq'])))
                file.close()
            print('------------------------------------------------------')
            print(
                str(time.time() - start_time) + ' seconds to compute ' +
                keyword + ' ' + emotion)
Code example #11
0
def analyze(originfile, all=False):
    keywords = helper.getKeywords(originfile)
    os.chdir('./resources/stanford-corenlp-full-2018-10-05')
    os.system('kill $(lsof -t -i:9000)')
    cmd = 'java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "tokenize,ssplit,pos,lemma,parse,sentiment" -port 9000 -timeout 10000000000000 &'
    print("starting nlp service")
    with open(os.devnull, "w") as f:
        subprocess.call(cmd, shell=True, stderr=f, stdout=f)
    # Give the server a few seconds to come up before sending requests.
    time.sleep(4)
    print("nlp service started")
    os.chdir('../../')
    nlp_wrapper = StanfordCoreNLP('http://localhost:9000')
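    # The wrapper talks to the CoreNLP server over HTTP; a typical call (as used
    # further below) looks roughly like:
    #
    #     ann = nlp_wrapper.annotate(text, properties={
    #         'annotators': 'lemma,pos', 'outputFormat': 'json'})
    #     tokens = ann['sentences'][0]['tokens']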
    print("Number of processors: ", mp.cpu_count())
    if all:
        print("all")
        '''if not os.path.isfile('/resources/all_test.csv'):
            print("test file created")
            open('./resources/all_test.csv', 'w').close()'''
        conn = db.db_connection()
        dbo = db.db_operator(conn)
        spell = SpellChecker()
        counter = Value('i', 1)
        corpus_tok_all=[]
        '''for i in range(1790):
            print('i=' +str(i))
            print("limit= 10000")
            print("offset= "+str(10000*i))
            conn.connect()
            query = 'SELECT reviews.ReviewID, reviews.Country as \'Tourist_Country\', ' \
                    'hotels.CountryID as \'Hotel Country\', Good, reviews.Bad ' \
                    'FROM masterthesis.reviews, masterthesis.hotels ' \
                    'where hotels.HotelNumber=reviews.HotelNumber limit 10000 offset '+str(10000*i)+';'
            results = [list(x) for x in dbo.execute(query)];
            conn.disconnect()
            print("got results from sql")
            print("starting analysis")
            print("tot number rows= " + str(len(results)))
            try:
                print('analyzing 10000 rows '+time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2,initargs=(counter, spell, nlp_wrapper,), )
                corpus_tok = pool.map_async(thread_function_row_only_all, [doc for doc in results]).get(timeout=1200)
                pool.close()
                pool.terminate()
                pool.join()
                print('got corpus_tok for 10000 rows '+time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            except TimeoutError:
                print("timeout error")
                pool.close()
                pool.terminate()
                pool.join()
                corpus_tok=[]
                for doc in results:
                    try:
                        pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2, initargs=(counter,spell,nlp_wrapper,), )
                        c=pool.map_async(thread_function_row_only_all, [doc]).get(timeout=60)
                        #print('pool close')
                        pool.close()
                        pool.terminate()
                        #print('pool join')
                        pool.join()

                    except TimeoutError:
                        print(str(doc)+" caused Exception")
                        pool.close()
                        pool.terminate()
                        #print('pool join')
                        pool.join()
                        c=[None]
                    corpus_tok.append(c[0])
            print("beginning removal of sents with contrast")
            corpus_tok = [r for r in corpus_tok if r != None]
            print('len corpus_tok_reduced= '+str(len(corpus_tok)))
            corpus_tok_all+=corpus_tok
            print('len corpus_tok_all= ' + str(len(corpus_tok_all)))
            if i%100==0 and i!=0:
                with open('./resources/all_test.csv', mode='a') as file:
                    writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                    for c in corpus_tok_all:
                        writer.writerow(c)
                file.close()
                corpus_tok_all=[]
        '''


        '''
        corpus_tok_all=[]
        i=0
        kk=set()
        with open('./resources/all_test.csv', mode='r') as file:
            reader = csv.reader(file, delimiter='|', quotechar='"')
            for row in reader:
                i+=1
                if i%100000==0:
                    print(i)
                #if i%10000==0:break
                ar=((row[0].replace('[','')).replace(']','')).split(',')
                if ar[1][-1]!="'":#France, Metro.
                    ar[1]=ar[1]+','+ar[2]
                    for j in range(2,len(ar)-1):
                        ar[j]=ar[j+1]
                    del ar[len(ar)-1]
                ar[1]=ar[1][2:-1]
                ar[2] = (ar[2].replace("'", '')).replace(' ', '')
                rev=''.join(ar[3:])
                revlist= ar[:3]
                revlist.append(rev)
                tokens = ((((row[1].replace(']', '')).replace('[','')).replace("'",'')).replace(" ",'')).split(',')
                r=(revlist,tokens)
                k=ar[0]
                if k not in kk:
                    kk.add(k)
                    corpus_tok_all.append(r)
        file.close()
        corpus_tok=corpus_tok_all
        corpustokonly = [r[1] for r in corpus_tok]
        print("doing bigrams")
        # Add bigrams to docs (only those occurring at least 0.001 * len(corpus_tok) times, via min_count).
        bigram = Phrases(corpustokonly, min_count=0.001 * len(corpus_tok))
        lenc=len(corpus_tok)
        print("corpus_tok len = "+str(lenc))
        for idx in range(lenc):
            if idx%100000==0:
                print(idx)
                print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            for token in bigram[corpustokonly[idx]]:
                if '_' in token:
                    # Token is a bigram, add to document.
                    corpus_tok[idx][1].append(token)
        with open('./resources/corpus_tok_all.csv', mode='w') as file:
                writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                writer.writerows(corpus_tok)
        file.close()
        print("corpus_tok written")
        from gensim.corpora import Dictionary
        print("writing frequence file")
        '''

        



        '''all_set=set()
        for emotion in ['Good', 'Bad']:
            print("begin " + emotion)
            for keyword in list(keywords.keys()):
                if not (keyword == 'cleaning' or keyword=='pet'):
                    start_time = time.time()
                    print(keyword + ' ---- ' + time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                    raw_corpus = helper.getRawCorpus(
                        csv_file=open('resources/csvs/' + keyword + '_' + emotion.lower() + '.csv', mode='r',
                                      encoding="utf8", newline='\n'), additionaldetails=True)
                    # corpus = helper.getCorpusTextFromRaw(raw_corpus)
                    spell = SpellChecker()
                    counter = Value('i', 1)
                    print("starting analysis")
                    pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2,
                                   initargs=(counter, spell, nlp_wrapper,), )
                    corpus_tok = pool.map_async(thread_function_row_only, [doc for doc in raw_corpus]).get()
                    print('pool close')
                    pool.close()
                    print('pool join')
                    pool.join()
                    print("beginning removal of sents with contrast")
                    corpus_tok = [r for r in corpus_tok if r != None]
                    ###############################################################################
                    # We find bigrams in the documents. Bigrams are sets of two adjacent words.
                    # Using bigrams we can get phrases like "machine_learning" in our output
                    # (spaces are replaced with underscores); without bigrams we would only get
                    # "machine" and "learning".
                    #
                    # Note that in the code below, we find bigrams and then add them to the
                    # original data, because we would like to keep the words "machine" and
                    # "learning" as well as the bigram "machine_learning".
                    #
                    # .. Important::
                    #     Computing n-grams of large dataset can be very computationally
                    #     and memory intensive.
                    #
                    print('len all_set_tok before= ' + str(len(all_set)))
                    print('len corpus_tok= ' + str(len(corpus_tok)))
                    print('len corpus_tok+all_set_tok= ' + str(len(corpus_tok) + len(all_set)))
                    for sen in corpus_tok:
                        all_set.add((tuple(sen[0]),tuple(sen[1])))
                    print('len all_set_tok after= ' + str(len(all_set)))
                    print('------------------------------------------------------')
                    print(str(time.time() - start_time) + ' seconds to compute ' + keyword + ' ' + emotion)
        # Compute bigrams.
        if len(all_set) > 0:
            corpus_tok=[(list(x[0]),list(x[1])) for x in all_set]
            corpustokonly = [r[1] for r in corpus_tok]
            print("doing bigrams")
            # Add bigrams to docs (only those occurring at least 0.001 * len(corpus_tok) times, via min_count).
            bigram = Phrases(corpustokonly, min_count=0.001 * len(corpus_tok))
            for idx in range(len(corpus_tok)):
                for token in bigram[corpustokonly[idx]]:
                    if '_' in token:
                        # Token is a bigram, add to document.
                        corpus_tok[idx][1].append(token)
            from gensim.corpora import Dictionary
            print("writing frequence file")

            # Create a dictionary representation of the documents.
            dictionary = Dictionary(corpustokonly)

            alltok = []
            freq = []
            for doc in corpustokonly:
                for tok in doc:
                    alltok.append(tok)
            lencorpus = len(corpus_tok)
            print("len dictionary = " + str(len(dictionary.keys())))
            i = 0
            for t in dictionary:
                i += 1
                if i % 1000 == 0:
                    print("analyzing token " + str(i))
                freqsent = 0
                for doc in corpustokonly:
                    if dictionary.get(t) in doc:
                        freqsent += 1
                freq.append((t, dictionary.get(t), alltok.count(dictionary.get(t)),
                             alltok.count(dictionary.get(t)) / len(alltok), freqsent, freqsent / lencorpus))
            freq.sort(key=lambda tup: tup[5], reverse=True)
            for i in range(len(freq)):
                freq[i] = tuple(list(freq[i]) + [i])
            if not os.path.exists('resources/bow/allfreq/stanford/'):
                os.makedirs('resources/bow/allfreq/stanford/')
            with open('resources/bow/allfreq/stanford/all.txt',
                      'w') as f:
                for item in freq:
                    f.write(str(item) + '\n')
                f.close()

            print("writing bow file")
            top_tokens = [f[1] for f in freq[:500]]
            lentoptok = len(top_tokens)
            corpus_bow = {}
            toplen = 0
            for i in range(len(corpus_tok)):
                corpus_bow[i] = [0] * lentoptok
                if len(corpus_tok[i][0] + corpus_tok[i][1]) > toplen:
                    toplen = len(corpus_tok[i][0] + corpus_tok[i][1])
                for tok in corpus_tok[i][1]:
                    if tok in top_tokens:
                        corpus_bow[i][top_tokens.index(tok)] = 1

            with open('resources/bow/all.csv', mode='w') as file:
                writer = csv.writer(file, delimiter='|', quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
                writer.writerow([''] * toplen + top_tokens)
                for i in corpus_bow.keys():
                    writer.writerow(corpus_tok[i][0] + corpus_tok[i][1] + [''] * (
                            toplen - len(corpus_tok[i][0] + corpus_tok[i][1])) + corpus_bow[i])
            file.close()
        '''
        


        # Create a dictionary representation of the documents.
        '''dictionary = Dictionary(corpustokonly)

        alltok = []
        freq = []
        for doc in corpustokonly:
            for tok in doc:
                alltok.append(tok)
        lencorpus = len(corpus_tok)
        print("len dictionary = " + str(len(dictionary.keys())))
        time.sleep(100000)
        counter = Value('i', 0)
        pool = mp.Pool(initializer=init_globals_token_analyzer, processes=mp.cpu_count(), initargs=(counter,corpustokonly,dictionary,lencorpus,alltok), )
        print("pool initialized")
        corpustokonly=None
        alltok=None
        del corpustokonly, alltok
        freq = pool.map_async(thread_function_row_only_token_analyzer, [t for t in dictionary]).get()
        pool.close()
        pool.terminate()
        pool.join()
        dictionary=None
        del dictionary
        global ctonly, dic, alltoks
        ctonly=None
        dic=None
        alltoks=None
        del ctonly,dic,alltoks
        print("frequence list len= "+str(len(freq)))
        print("frequence list created")
        freq.sort(key=lambda tup: tup[5], reverse=True)
        print("frequence list sorted")
        for i in range(len(freq)):
            if i%10000==0:
                print(i)
                print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            freq[i] = tuple(list(freq[i]) + [i])
        print("frequence list modified")
        if not os.path.exists('resources/bow/allfreq/stanford/'):
            os.makedirs('resources/bow/allfreq/stanford/')
        i=0
        '''
        '''with open('resources/bow/allfreq/stanford/all.txt', 'w') as f:
            for item in freq:
                i+=1
                if i%10000==0:
                    print(i)
                    print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                f.write(str(item) + '\n')
            f.close()'''

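        # Reload the tokenized corpus and the token-frequency list produced by the
        # passes above (kept commented out once their outputs were cached on disk).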
        corpus_tok=[]
        i=0
        with open('./resources/corpus_tok_all.csv', mode='r') as file:
            reader = csv.reader(file, delimiter='|', quotechar='"')
            for row in reader:
                i+=1
                if i%100000==0:
                    print(i)
                corpus_tok.append(row)
        file.close()
        print("len corpus_tok= "+str(len(corpus_tok)))
        freq=[]
        i=0
        with open('./resources/bow/allfreq/stanford/all.txt', mode='r') as file:
            reader = csv.reader(file, delimiter='|', quotechar='"')
            for row in reader:
                i+=1
                if i==501:break
                freq.append(row)
        file.close()
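        # all.txt stores one stringified tuple per line; strip the quoting and
        # parentheses and split on commas to recover the individual fields.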
        for i in range(len(freq)):
            freq[i]=freq[i][0]
            freq[i]=freq[i].replace("'",'')
            freq[i]=freq[i].replace('"','')
            freq[i]=freq[i].replace('(','')
            freq[i]=freq[i].replace(')','')
            freq[i]=freq[i].replace(' ','')
            freq[i]=freq[i].split(',')
            freq[i]=tuple(freq[i])
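        # The review details were serialized as strings; re-parse them and special-case
        # a few country names containing commas or mojibake ("São Tomé and Príncipe",
        # "Cote d'Ivoire", "France, Metro.").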
        for i in range(len(corpus_tok)):
            if i%100000==0:
                print(i)
            corpus_tok[i][0]=corpus_tok[i][0].replace('[','')
            corpus_tok[i][0]=corpus_tok[i][0].replace(']','')
            det=(corpus_tok[i][0].split(','))
            if 'São Tomé' in det[1]:#São Tomé and PrÃ\\\\xadncipe
                det[1]='  '+'São Tomé and PrÃ\xadncipe'+' '
            if det[1][-1]!="'":#France, Metro
                if 'Ivoire' in det[1]:#Cote d'Ivoire
                    det[1]=det[1].replace('\\','')
                    det[2]=det[2][1:]
                else:
                    det[1]=det[1]+','+det[2]
                    for j in range(2,len(det)-1):
                        det[j]=det[j+1]
                    del det[len(det)-1]
            det=det[:3]
            desc=(corpus_tok[i][0].split(','))[-1]
            det[0]=det[0][1:-1]
            det[1]=det[1][2:-1]
            det[2]=det[2][2:-1]
            desc=desc[3:-1]
            det.append(desc)
            corpus_tok[i][0]=det
            corpus_tok[i][1]=corpus_tok[i][1].replace("'",'')
            corpus_tok[i][1]=corpus_tok[i][1].replace(' ','')
            corpus_tok[i][1]=corpus_tok[i][1].replace('[','')
            corpus_tok[i][1]=corpus_tok[i][1].replace(']','')
            corpus_tok[i][1]=corpus_tok[i][1].split(',')
        print("writing bow file")
        top_tokens = [f[1] for f in freq[:400]]
        lentoptok = len(top_tokens)
        corpus_bow = {}
        toplen = 0
        print("corpus_tok_len= "+str(len(corpus_tok)))
        for i in range(len(corpus_tok)):
            corpus_bow[i] = [0] * lentoptok
            if i%100000==0:
                print(i)
                print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            if len(corpus_tok[i][0] + corpus_tok[i][1]) > toplen:
                toplen = len(corpus_tok[i][0] + corpus_tok[i][1])
            for tok in corpus_tok[i][1]:
                if tok in top_tokens:
                    corpus_bow[i][top_tokens.index(tok)] = 1
        print("len corpus_bow keys= "+str(len(corpus_bow.keys())))
        print("got corpus_bow")
        j=0
        print("corpus_bow_len "+str(len(corpus_bow)))
        with open('resources/bow/all.csv', mode='w') as file:
            writer = csv.writer(file, delimiter='|', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
            writer.writerow([''] * toplen + top_tokens)
            for i in corpus_bow.keys():
                j+=1
                if j%100000==0:
                    print(j)
                    print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                writer.writerow(
                    corpus_tok[i][0] + corpus_tok[i][1] + [''] * (toplen - len(corpus_tok[i][0] + corpus_tok[i][1])) +
                    corpus_bow[i])
        file.close()
        print("over")
    else:
        print("not all")
        for emotion in ['Good','Bad']:
            print("begin " + emotion)
            for keyword in list(keywords.keys()):
                if emotion=='Good' and keyword=='cleaning':#cleaning good
                    start_time = time.time()
                    print(keyword+' ---- '+time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                    spell = SpellChecker()
                    counter = Value('i', 1)
                    corpus_tok_all=[]
                    #if not os.path.isfile('/resources/cleaning_test.csv'):
                        #open('./resources/cleaning_test.csv', 'w').close()
                    for i in range(400):#400
                        print(str(i))
                        offset=i*1000
                        limit=1000
                        print("starting reading")
                        print("limit="+str(limit))
                        print("offset="+str(offset))
                        raw_corpus = helper.getRawCorpus(
                            csv_file=open('resources/csvs/' + keyword + '_' + emotion.lower() + '.csv', mode='r',
                                          encoding="utf8", newline='\n'), additionaldetails=True, limit=limit, offset=offset)

                        #corpus = helper.getCorpusTextFromRaw(raw_corpus)
                        #raw_corpus_half_one = raw_corpus[:int(len(raw_corpus) / 2)]
                        #raw_corpus_half_two=raw_corpus[int(len(raw_corpus)/2):]
                        print("starting analysis")
                        pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2, initargs=(counter,spell,nlp_wrapper,), )
                        try:
                            corpus_tok = pool.map_async(thread_function_row_only, [doc for doc in raw_corpus]).get(timeout=30)
                            pool.close()
                            pool.join()
                        except TimeoutError:
                            print("timeout error")
                            print('pool close')
                            pool.close()
                            print('pool terminate')
                            pool.terminate()
                            print('pool join')
                            pool.join()
                            corpus_tok=[]
                            for doc in raw_corpus:
                                try:
                                    pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2, initargs=(counter,spell,nlp_wrapper,), )
                                    c=pool.map_async(thread_function_row_only, [doc]).get(timeout=30)
                                    #print('pool close')
                                    pool.close()
                                    #print('pool join')
                                    pool.join()
                                    '''thread = threading.Thread(target = thread_function_row_only, args = (doc))
                                    thread.start()
                                    thread.join()
                                    c=que.get()'''
                                except TimeoutError:
                                    print(str(doc)+" caused Exception")
                                    c=[None]
                                corpus_tok.append(c[0])
                        corpus_tok_reduced = [r for r in corpus_tok if r is not None]
                        print("len corpus_tok: " + str(len(corpus_tok)))
                        print("len corpus_tok_reduced: " + str(len(corpus_tok_reduced)))
                        '''with open('./resources/cleaning_test.csv', mode='a') as file:
                            writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                            for c in corpus_tok_reduced:
                                writer.writerow(c)
                        file.close()'''
                        corpus_tok_all+=corpus_tok_reduced
                        print("len corpus_tok_all: " + str(len(corpus_tok_all)))
                    '''
                    corpus_tok=[]
                    s=0
                    for doc in corpus:
                        newdoc=False
                        doc = doc.lower()
                        s += 1
                        if s % 10000 == 0:
                            print(str(s))
                        for con in constr_conjs:
                            if con in doc:
                                newdoc=True
                                break
                        if not newdoc:
                            toks = [spell.correction(tok['lemma']) for tok in
                                    nlp_wrapper.annotate(doc,
                                                         properties={'annotators': 'lemma, pos', 'outputFormat': 'json', })[
                                        'sentences'][0]['tokens']
                                    if tok['pos'] in ['NNS', 'NN'] and len(tok['lemma']) > 1]
                            toapp = []
                            for i in range(len(toks)):
                                if '/' in toks[i]:
                                    for tok in toks[i].split('/'):
                                        toapp.append(tok)
                            for tok in toapp:
                                toks.append(tok)
                            toapp = []
                            for i in range(len(toks)):
                                if '-' in toks[i]:
                                    for tok in toks[i].split('-'):
                                        toapp.append(tok)
                            for tok in toapp:
                                toks.append(tok)
                            corpus_tok.append(toks)'''
                    #print("beginning removal of sents with contrast")
                    corpus_tok=corpus_tok_all
                    print("len corpus_tok: " + str(len(corpus_tok)))
                    ###############################################################################
                    # We find bigrams in the documents. Bigrams are sets of two adjacent words.
                    # Using bigrams we can get phrases like "machine_learning" in our output
                    # (spaces are replaced with underscores); without bigrams we would only get
                    # "machine" and "learning".
                    #
                    # Note that in the code below, we find bigrams and then add them to the
                    # original data, because we would like to keep the words "machine" and
                    # "learning" as well as the bigram "machine_learning".
                    #
                    # .. Important::
                    #     Computing n-grams of large dataset can be very computationally
                    #     and memory intensive.
                    #
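                    # As a toy illustration (a sketch, not part of the pipeline): once
                    # "machine learning" is frequent enough in the training sentences,
                    # bigram[['machine', 'learning', 'is', 'fun']] would come back with a
                    # 'machine_learning' token, and the loop below appends such merged
                    # tokens to the original token list.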
                    # Compute bigrams.
                    if len(corpus_tok)>0:
                        corpustokonly=[r[1] for r in corpus_tok]
                        print("doing bigrams")
                        # Add bigrams to docs (only those occurring at least
                        # 0.001 * len(corpus_tok) times, via min_count).
                        bigram = Phrases(corpustokonly, min_count=0.001 * len(corpus_tok))
                        for idx in range(len(corpus_tok)):
                            for token in bigram[corpustokonly[idx]]:
                                if '_' in token:
                                    # Token is a bigram, add to document.
                                    corpus_tok[idx][1].append(token)
                        from gensim.corpora import Dictionary
                        print("writing frequence file")

                        # Create a dictionary representation of the documents.
                        dictionary = Dictionary(corpustokonly)

                        alltok = []
                        freq=[]
                        for doc in corpustokonly:
                            for tok in doc:
                                alltok.append(tok)
                        lencorpus=len(corpus_tok)
                        print("len dictionary = "+str(len(dictionary.keys())))
                        i=0
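                        # Each freq entry: (token id, token, total occurrences, share of
                        # all token occurrences, number of sentences containing the token,
                        # share of sentences containing the token).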
                        for t in dictionary:
                            i+=1
                            if i%1000==0:
                                print("analyzing token "+str(i))
                            freqsent = 0
                            for doc in corpustokonly:
                                if dictionary.get(t) in doc:
                                    freqsent+=1
                            freq.append((t,dictionary.get(t),alltok.count(dictionary.get(t)),alltok.count(dictionary.get(t))/len(alltok),freqsent,freqsent/lencorpus))
                        freq.sort(key=lambda tup: tup[5], reverse=True)
                        for i in range(len(freq)):
                            freq[i]=tuple(list(freq[i])+[i])
                        if not os.path.exists('resources/bow/allfreq/stanford/'):
                            os.makedirs('resources/bow/allfreq/stanford/')
                        with open('resources/bow/allfreq/stanford/'+keyword+'_'+emotion.lower()+'.txt', 'w') as f:
                            for item in freq:
                                f.write(str(item)+'\n')
                            f.close()

                        print("writing bow file")
                        top_tokens=[f[1] for f in freq[:500]]
                        lentoptok=len(top_tokens)
                        corpus_bow={}
                        toplen=0
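                        # corpus_bow[i] is a 0/1 indicator over the top tokens; toplen
                        # tracks the widest details+tokens prefix so every CSV row below
                        # can be padded to the same width before the indicator columns.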
                        for i in range(len(corpus_tok)):
                            corpus_bow[i]=[0]*lentoptok
                            if len(corpus_tok[i][0]+corpus_tok[i][1])>toplen:
                                toplen=len(corpus_tok[i][0]+corpus_tok[i][1])
                            for tok in corpus_tok[i][1]:
                                if tok in top_tokens:
                                    corpus_bow[i][top_tokens.index(tok)]=1

                        with open('resources/bow/'+keyword+'_'+emotion.lower()+'.csv', mode='w') as file:
                            writer = csv.writer(file, delimiter='|', quotechar='"',
                                                         quoting=csv.QUOTE_MINIMAL)
                            writer.writerow(['']*toplen+top_tokens)
                            for i in corpus_bow.keys():
                                writer.writerow(corpus_tok[i][0]+corpus_tok[i][1]+['']*(toplen-len(corpus_tok[i][0]+corpus_tok[i][1]))+corpus_bow[i])
                        file.close()
                    print('------------------------------------------------------')
                    print(str(time.time() - start_time) + ' seconds to compute ' + keyword + ' ' + emotion)
Code example #12
0
def do(originfile, all=False):
    if all:
        start_time = time.time()
        print('all ----- ' + time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
        with open('resources/bow/all.csv') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter='|')
            tokens, cluster_tourist_hotel = cluster_all(csv_reader)
        csv_file.close()
        if not os.path.exists('resources/bow/tourist_hotel_country_freq/'):
            os.makedirs('resources/bow/tourist_hotel_country_freq/')
        print("got cluster of all, start writing")
        with open('resources/bow/tourist_hotel_country_freq/all.csv',
                  mode='w') as file:
            writer = csv.writer(file,
                                delimiter='|',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
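            # Each data row: tourist country, hotel country, number of reviews,
            # one relative-frequency value per token, then the review IDs for that
            # (tourist, hotel) country pair.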
            writer.writerow([''] * 2 + ['unique IDs'] + tokens)
            del tokens
            for country in cluster_tourist_hotel.keys():
                writer.writerow(
                    [country[0], country[1]] +
                    [cluster_tourist_hotel[country]['count_rev']] + list(
                        map("{:.15f}".format, cluster_tourist_hotel[country]
                            ['rel_freq'])) +
                    list(cluster_tourist_hotel[country]['unique_reviews']))
        file.close()
        print('------------------------------------------------------')
        print(str(time.time() - start_time) + ' seconds to compute all')
    else:
        keywords = helper.getKeywords(originfile)
        for emotion in ['Good', 'Bad']:
            print("begin " + emotion)
            for keyword in list(keywords.keys()):
                start_time = time.time()
                goforward = True
                print(keyword + ' ---- ' +
                      time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                try:
                    with open('resources/bow/' + keyword + '_' +
                              emotion.lower() + '.csv') as csv_file:
                        csv_reader = csv.reader(csv_file, delimiter='|')
                        tokens, cluster_tourist_hotel = cluster(csv_reader)
                    csv_file.close()
                except Exception:  # typically the bow file for this keyword does not exist yet
                    goforward = False
                if goforward:
                    if not os.path.exists(
                            'resources/bow/tourist_hotel_country_freq/'):
                        os.makedirs(
                            'resources/bow/tourist_hotel_country_freq/')
                    with open('resources/bow/tourist_hotel_country_freq/' +
                              keyword + '_' + emotion.lower() + '.csv',
                              mode='w') as file:
                        writer = csv.writer(file,
                                            delimiter='|',
                                            quotechar='"',
                                            quoting=csv.QUOTE_MINIMAL)
                        writer.writerow([''] * 2 + ['unique IDs'] + tokens)
                        for country in cluster_tourist_hotel.keys():
                            writer.writerow(
                                [country[0], country[1]] +
                                [cluster_tourist_hotel[country]['count_rev']] +
                                list(
                                    map(
                                        "{:.15f}".format,
                                        cluster_tourist_hotel[country]
                                        ['rel_freq'])) +
                                list(cluster_tourist_hotel[country]
                                     ['unique_reviews']))
                    file.close()
                print('------------------------------------------------------')
                print(
                    str(time.time() - start_time) + ' seconds to compute ' +
                    keyword + ' ' + emotion)