class DictionaryMaker:
    """Accumulate word statistics from press-note CSV files and dump them.

    Every ``parse*`` call feeds the titles and texts of the loaded press
    notes to ``self.wordcount.parse_text`` together with
    ``self.wordcount_dictionary``; ``dump`` then writes that dictionary
    sorted by value, largest first.
    NOTE(review): ``WordCount.parse_text`` is presumably what adds the
    per-word counts to the dictionary it receives - confirm against WordCount.
    """

    def __init__(self, language_code):
        """Create the word counter for one language.

        Args:
            language_code: One of 'en', 'es' or 'fr'.

        Raises:
            KeyError: If ``language_code`` is not one of the supported codes.
        """
        self.language_codes = {
            'en': 'english',
            'es': 'spanish',
            'fr': 'french'
        }
        self.language_code = language_code
        self.wordcount = WordCount(self.language_codes[language_code])
        self.wordcount_dictionary = {}

    @staticmethod
    def _limit_reached(parsed, max_parsed_pressnotes):
        """Return True once ``parsed`` notes exhaust the (optional) limit."""
        return (max_parsed_pressnotes is not None
                and parsed >= max_parsed_pressnotes)

    def _matching_files(self, directory, include_english):
        """Yield the CSV paths below ``directory`` that should be parsed.

        A folder whose name starts with the language code contributes its
        'rss_unique.csv'.  Any other folder contributes its 'rss_en.csv',
        but only when ``include_english`` is true.
        """
        for root, _subfolders, filenames in os.walk(directory):
            if root.split(os.sep)[-1].startswith(self.language_code):
                wanted = 'rss_unique.csv'
            elif include_english:
                wanted = 'rss_en.csv'
            else:
                continue
            if wanted in filenames:
                yield os.path.join(root, wanted)

    def _parse_pressnotes(self, pressnote_list, parsed, max_parsed_pressnotes):
        """Parse notes until the list or the limit is exhausted.

        Args:
            pressnote_list: Iterable of objects with ``title`` and ``text``.
            parsed: Number of notes already parsed before this call.
            max_parsed_pressnotes: Overall limit, or None for no limit.

        Returns:
            The updated number of parsed notes.
        """
        for pressnote in pressnote_list:
            # Check before parsing so that at most `max` notes are consumed.
            if self._limit_reached(parsed, max_parsed_pressnotes):
                break
            self.wordcount.parse_text(pressnote.title,
                                      self.wordcount_dictionary)
            self.wordcount.parse_text(pressnote.text,
                                      self.wordcount_dictionary)
            parsed += 1
        return parsed

    def _parse_files(self, file_paths, max_parsed_pressnotes):
        """Parse the given files in order and print how many notes were read.

        Stops opening further files as soon as the limit is reached, so the
        limit applies to the whole run and not to each file separately.
        """
        parsed = 0
        for file_path in file_paths:
            if self._limit_reached(parsed, max_parsed_pressnotes):
                break
            print(file_path)
            parsed = self._parse_pressnotes(PressNote.load_list(file_path),
                                            parsed, max_parsed_pressnotes)
        print("Parsed: " + str(parsed) + " press notes")

    def parse_language(self, directory, max_parsed_pressnotes=None):
        """Parse only the folders of this instance's language.

        Args:
            directory: Root directory that is walked recursively.
            max_parsed_pressnotes: Maximum number of notes to parse in
                total; None means no limit.
        """
        self._parse_files(self._matching_files(directory, False),
                          max_parsed_pressnotes)

    def parse_language2(self, file_path, max_parsed_pressnotes=None):
        """Parse a single press-note file.

        Args:
            file_path: Path of the file handed to ``PressNote.load_list``.
            max_parsed_pressnotes: Maximum number of notes to parse;
                None means no limit.
        """
        self._parse_files([file_path], max_parsed_pressnotes)

    def parse(self, directory, max_parsed_pressnotes=None):
        """Parse this language's folders plus the English files of the rest.

        Folders matching the language code contribute 'rss_unique.csv';
        every other folder contributes 'rss_en.csv'.

        Args:
            directory: Root directory that is walked recursively.
            max_parsed_pressnotes: Maximum number of notes to parse in
                total; None means no limit.
        """
        self._parse_files(self._matching_files(directory, True),
                          max_parsed_pressnotes)

    def dump(self, dictionary_name, dict_max_size=None):
        """Write the collected dictionary to a text file.

        Entries are sorted by value in descending order and written one per
        line as "<key> <value>"; the file gets no trailing newline.

        Args:
            dictionary_name: Path of the output file (overwritten).
            dict_max_size: Keep only this many top entries; None keeps all.
        """
        sorted_wordcount = sorted(self.wordcount_dictionary.items(),
                                  key=operator.itemgetter(1),
                                  reverse=True)
        if dict_max_size is not None:
            sorted_wordcount = sorted_wordcount[:dict_max_size]
        lines = [item[0] + " " + str(item[1]) for item in sorted_wordcount]
        with open(dictionary_name, 'w') as f:
            f.write('\n'.join(lines))
# Example no. 2
# 0
class BagOfWords:
    """Turn press notes into word vectors and cluster them.

    Two modes exist:

    * per-file mode (default): every matching file gets its own temporary
      dictionary, its own bag of words and its own cluster file;
    * dictionary mode (``dictionary_path`` given / ``from_dictionary``):
      one bag of words is built for all matching files from an existing
      dictionary file; clustering is left to the caller.

    ``self.bag_of_words`` maps each press note to a list with one slot per
    dictionary entry, filled from the per-note dictionary that
    ``self.wordcount.parse_text`` populates.
    """

    def __init__(self,
                 language_code,
                 output_dir,
                 dir_notes,
                 dict_max_size=None,
                 dictionary_path=None):
        """Build the bag(s) of words right away.

        The class used to define ``__init__`` twice, so the second definition
        silently replaced the first and the dictionary-based variant could
        never run.  Both variants now live here, selected by
        ``dictionary_path``.

        Args:
            language_code: One of 'en', 'es' or 'fr'.
            output_dir: Directory for the temporary dictionary and the
                cluster files (per-file mode only).
            dir_notes: Root directory with the press-note CSV files.
            dict_max_size: Maximum number of dictionary entries to use;
                None means no limit.
            dictionary_path: When given, load this dictionary and build a
                single bag of words for all files instead of clustering
                every file separately.

        Raises:
            KeyError: If ``language_code`` is not a supported code.
        """
        self.bag_of_words = {}
        self.language_code = language_code
        language_codes = {'en': 'english', 'es': 'spanish', 'fr': 'french'}
        self.wordcount = WordCount(language_codes[language_code])
        self.dict_max_size = dict_max_size
        if dictionary_path is None:
            # Variant clustering the notes of each file separately.
            self.create2(dir_notes, output_dir)
        else:
            # Variant covering all files with one shared dictionary.
            self.word_indexes = self.load_dictionary(dictionary_path,
                                                     dict_max_size)
            self.create(dir_notes)

    @classmethod
    def from_dictionary(cls,
                        language_code,
                        dictionary_path,
                        dir_notes,
                        dict_max_size=None):
        """Alternate constructor for dictionary mode.

        Takes the arguments of the formerly shadowed first ``__init__``;
        ``output_dir`` is not used in this mode.
        """
        return cls(language_code, None, dir_notes, dict_max_size,
                   dictionary_path=dictionary_path)

    @staticmethod
    def load_dictionary(dictionary_path, dict_max_size=None):
        """Map the first token of each dictionary line to its line index.

        Args:
            dictionary_path: Text file with one "<word> <count>" per line.
            dict_max_size: Read at most this many lines; None reads all.

        Returns:
            dict of word -> zero-based line number.
        """
        dict_indexes = {}
        with open(dictionary_path, 'r') as f:
            for index, line in enumerate(f):
                # Checked before storing so that a limit of 0 loads nothing.
                if dict_max_size is not None and index >= dict_max_size:
                    break
                dict_indexes[line.split(' ')[0]] = index
        return dict_indexes

    @staticmethod
    def _cluster_count(n_samples):
        """Return the number of clusters to request for ``n_samples`` notes.

        One cluster per 30 notes, at least 50, but never more than there are
        notes.  Integer division keeps the result an int on Python 3 as well.
        """
        return min(max(n_samples // 30, 50), n_samples)

    def _is_language_folder(self, root):
        """Tell whether the last component of ``root`` starts with the code."""
        return root.split(os.sep)[-1].startswith(self.language_code)

    def _vectorize(self, pressnote):
        """Return the word vector of one note over ``self.word_indexes``."""
        note_dictionary = {}
        self.wordcount.parse_text(pressnote.title, note_dictionary)
        self.wordcount.parse_text(pressnote.text, note_dictionary)
        word_vector = [0] * len(self.word_indexes)
        for word in note_dictionary:
            if word in self.word_indexes:
                word_vector[self.word_indexes[word]] = note_dictionary[word]
        return word_vector

    def _report_size(self):
        """Print the dimensions of the current bag of words."""
        # Every vector has len(self.word_indexes) slots, so the width can be
        # reported even when no note was loaded.
        print("Created bag of words: " + str(len(self.bag_of_words)) +
              " x " + str(len(self.word_indexes)) + "\n")

    def create(self, dir_notes):
        """Build one bag of words from all matching files below ``dir_notes``.

        Folders matching the language code contribute 'rss_unique.csv';
        every other folder contributes 'rss_en.csv'.  Requires
        ``self.word_indexes`` to be set.
        """
        pressnote_list = []
        for root, _subfolders, filenames in os.walk(dir_notes):
            wanted = ('rss_unique.csv' if self._is_language_folder(root)
                      else 'rss_en.csv')
            if wanted in filenames:
                file_path = os.path.join(root, wanted)
                print(file_path)
                pressnote_list.extend(PressNote.load_list(file_path))

        for pressnote in pressnote_list:
            self.bag_of_words[pressnote] = self._vectorize(pressnote)

        self._report_size()

    def create2(self, dir_notes, output_dir):
        """Build, per matching file, a dictionary, a bag of words and clusters.

        Matching files are 'rss_unique<digits>.csv' in folders of this
        language and 'rss_en<digits>.csv' elsewhere.  For each of them a
        temporary dictionary is written to
        ``output_dir/temp_dictionary.txt`` and the clusters go to
        ``output_dir/<parent folder>/cluster_<folder><digits>.txt``.
        Files without press notes are skipped.
        """
        # Compiled once instead of once per visited file.
        unique_pattern = re.compile(r'^rss_unique(\d*)\.csv$')
        english_pattern = re.compile(r'^rss_en(\d*)\.csv$')
        temp_dictionary_path = output_dir + os.sep + 'temp_dictionary.txt'

        for root, _subfolders, filenames in os.walk(dir_notes):
            path_parts = root.split(os.sep)
            pattern = (unique_pattern if self._is_language_folder(root)
                       else english_pattern)
            for filename in filenames:
                matched = pattern.match(filename)
                if not matched:
                    continue

                file_path = os.path.join(root, filename)
                pressnote_list = PressNote.load_list(file_path)
                if not pressnote_list:
                    # Nothing to vectorize or cluster in an empty file.
                    continue

                dictionary_maker = DictionaryMaker(self.language_code)
                dictionary_maker.parse_language2(file_path)
                dictionary_maker.dump(temp_dictionary_path)
                self.word_indexes = self.load_dictionary(
                    temp_dictionary_path, self.dict_max_size)

                self.bag_of_words = {}
                for pressnote in pressnote_list:
                    self.bag_of_words[pressnote] = self._vectorize(pressnote)

                self._report_size()

                # The digits captured from the file name ('' when absent).
                number = matched.group(1)
                self.cluster(output_dir + os.sep + path_parts[-2] + os.sep +
                             'cluster_' + path_parts[-1] + number + '.txt')

    def cluster(self, clusters_file_path):
        """Cluster the current bag of words and save the result.

        Uses Ward-linkage agglomerative clustering.  The output file starts
        with the line "0"; whenever the label changes, an empty line and the
        new label are written; the ``str()`` of each note follows its label
        with no extra separator.

        Args:
            clusters_file_path: Path of the output file (overwritten).
        """
        notes = list(self.bag_of_words)
        X = [self.bag_of_words[note] for note in notes]

        # Dimensionality reduction (PCA / TruncatedSVD) was tried here in the
        # past and is currently disabled.

        n_clusters = self._cluster_count(len(X))
        ward = AgglomerativeClustering(n_clusters=n_clusters,
                                       linkage='ward').fit(X)

        print("Clusters created: " + str(n_clusters))

        # labels_ is ordered like X, i.e. like `notes`.
        Y = dict(zip(notes, ward.labels_))
        sorted_clusters = sorted(Y.items(), key=operator.itemgetter(1))

        cluster_number = 0
        with open(clusters_file_path, 'w') as f:
            f.write(str(cluster_number) + "\n")
            for note, label in sorted_clusters:
                if cluster_number != label:
                    cluster_number = label
                    f.write("\n" + str(cluster_number) + "\n")
                f.write(str(note))

        print("Clusters saved: " + clusters_file_path + "\n")