Example #1
class Group(object):
    """Represent a group of data.

    Groups can be compared to each other, and try to combine common
    data stored in more than one data file.
    """

    def __init__(self, name, fileTypes):
        self._name = name
        self._fileTypes = fileTypes
        # Instance attributes, not class attributes: lists defined at class
        # level would be shared (and mutated) by every Group instance.
        self._headings = []
        self._headingsType = []
        self._currRow = 0
    def createGroup(self, filenameList):
        """Parse each data file and combine common data.

        @param filenameList: list of filenames that contain data.
        """
        # Collect every heading that appears in any of the files.
        for filename in filenameList:
            if not filename.endswith('.csv'):
                continue  # only CSV files have a parser here
            self._parser = CSVParser(filename)
            self._parser.parse()
            for heading in self._parser.getHeadings():
                if heading not in self._headings:
                    self._headings.append(heading)
                    self._headingsType.append(
                        self._parser.getHeadingType(heading))

        # Persist the combined headings as one table.
        db = SQL("C:\\PerGraph\\" + self._name + ".db")
        db.connect()
        db.createTable("data", self._headings, self._headingsType)
        db.commit()
        db.close()
    def sort(self, heading):
        """Sort the collected rows in place by the given heading.

        Assumes self._allData has been populated elsewhere.
        """
        print('BEFORE')
        for item in self._allData:
            print(item)

        try:
            idx = self._headings.index(heading)
            # key= replaces the Python 2 cmp-based lambda.
            self._allData.sort(key=lambda row: row[idx])
        except ValueError:
            print('Header %s does not exist' % heading)

        print('AFTER')
        for item in self._allData:
            print(item)
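A minimal usage sketch for the class above, assuming CSVParser and SQL are the project's own helpers; the group name and file names are hypothetical:

# Hypothetical usage; CSVParser and SQL come from the surrounding project,
# and the names below are made up for illustration.
group = Group('benchmarks', ['.csv'])
group.createGroup(['run1.csv', 'run2.csv'])  # writes C:\PerGraph\benchmarks.db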
Example #2
import multiprocessing as mp
import time

# load_addb, AdapDictDB, CSVParser and GetWordProcess are project-local
# helpers imported elsewhere in the original module.

path_csv = './csvs/words_gre.csv'
path_sname = 'wgre2'
path_addb = './{}.addb'.format(path_sname)
path_wf = './{}.wf'.format(path_sname)
num_gwp = 1  # number of GetWordProcess workers

if __name__ == '__main__':
    addb = load_addb(path_addb)
    if not addb:
        addb = AdapDictDB()  # fall back to a fresh database

    # Bounded queues: words flow to the workers through iqueue and parsed
    # results come back through oqueue.
    iqueue = mp.Queue(maxsize=3000)
    oqueue = mp.Queue(maxsize=3000)
    mlock = mp.Lock()

    time_parsecsv = time.time()
    csvparser = CSVParser(path_csv)
    cands_word = csvparser.process_file()
    time_parsecsv = time.time() - time_parsecsv
    print("{:.3f} seconds passed for parsing {} words from {}".format(
        time_parsecsv, len(cands_word), path_csv))

    # Start the worker pool that consumes iqueue and fills oqueue.
    parsers = []
    for _ in range(num_gwp):
        parsers.append(GetWordProcess(mlock, iqueue, oqueue))
    for parser in parsers:
        parser.start()

    time_getwords = time.time()

    words_in_addb = addb.words.keys()
    words_failed = []
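The GetWordProcess workers are driven through the two queues with one 'FINISHED' sentinel per worker (see the next example); GetWordProcess itself is not shown in these snippets. A self-contained sketch of the same queue-and-sentinel pattern, with a stub standing in for the real word lookup, might look like:

import multiprocessing as mp

def lookup(word):
    # Stub standing in for the real dictionary lookup done by GetWordProcess.
    return word.upper()

def worker(iqueue, oqueue):
    failed = []
    while True:
        word = iqueue.get()
        if word == 'FINISHED':              # sentinel: report failures, exit
            oqueue.put(['FINISHED', failed])
            return
        try:
            oqueue.put(lookup(word))
        except Exception:
            failed.append(word)

if __name__ == '__main__':
    iqueue, oqueue = mp.Queue(), mp.Queue()
    procs = [mp.Process(target=worker, args=(iqueue, oqueue))
             for _ in range(2)]
    for p in procs:
        p.start()
    for w in ['alpha', 'beta', 'gamma']:
        iqueue.put(w)
    for _ in procs:
        iqueue.put('FINISHED')              # one sentinel per worker
    num_done = 0
    while num_done < len(procs):
        rw = oqueue.get()
        if isinstance(rw, list) and rw[0] == 'FINISHED':
            num_done += 1
        else:
            print(rw)
    for p in procs:
        p.join()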
Example #3
    def parse_csv(self, name_csv='default'):
        """Parse one CSV of candidate words and merge the results into addb."""
        path_csv = './csvs/{}.csv'.format(name_csv)
        time_parsecsv = time.time()
        csvparser = CSVParser(path_csv)
        cands_word = csvparser.process_file()
        time_parsecsv = time.time() - time_parsecsv
        print("{:.3f} seconds passed for parsing {} words from {}".format(
            time_parsecsv, len(cands_word), path_csv))

        parsers = []
        for _ in range(self.num_gwp):
            parsers.append(GetWordProcess(self.mlock, self.iqueue,
                                          self.oqueue))
        for parser in parsers:
            parser.start()

        time_getwords = time.time()

        words_failed = list()

        num_finished_words = 0
        num_words_existed = 0
        num_twords = len(cands_word)             # total words to process
        num_to_print = max(num_twords // 50, 1)  # progress print interval
        num_words_per_iteration = 100

        cws = list(cands_word.keys())  # candidate words still to be queued

        num_finished = 0
        sentinels_sent = False
        while num_finished < self.num_gwp:

            # Feed the workers in small batches so the queues stay bounded.
            if len(cws) > 0 and self.oqueue.qsize() < num_words_per_iteration:
                for cw in cws[:num_words_per_iteration]:
                    if not self.addb.exist_word(cw):
                        self.iqueue.put(cw)
                    else:
                        num_words_existed += 1
                    time.sleep(0.001)
                cws = cws[num_words_per_iteration:]

            # Everything queued and consumed: send one 'FINISHED' sentinel per
            # worker, guarded so the sentinels are only sent once.
            if not sentinels_sent and len(cws) == 0 and self.iqueue.qsize() == 0:
                for i in range(len(parsers)):
                    self.iqueue.put('FINISHED')
                sentinels_sent = True

            if self.oqueue.qsize() > 0:
                rw = self.oqueue.get()
                if rw:
                    if isinstance(rw, list) and rw[0] == 'FINISHED':
                        # A worker is done; it reports its failed words.
                        num_finished += 1
                        words_failed += rw[1]
                    else:
                        try:
                            if not self.addb.add_word(rw):
                                if rw.name not in cands_word:
                                    # Word discovered indirectly: give it a
                                    # default importance.
                                    self.addb.words[rw.name].set_impf(4)
                                    num_twords += 1
                                else:
                                    self.addb.words[rw.name].set_impf(
                                        cands_word[rw.name][0])
                                    self.addb.words[rw.name].add_smean(
                                        cands_word[rw.name][1])
                                num_finished_words += 1
                            else:
                                print("{} is already in addb?".format(rw.name))

                        except Exception as e:
                            print(
                                "[ERROR | ParsingManager/parse_csv] {}".format(
                                    str(e)))

                        if num_finished_words % num_to_print == 0:
                            print(
                                "num_finished_words : {:5d} / {:5d} ({:.2f}%)".
                                format(num_finished_words, num_twords,
                                       num_finished_words / num_twords * 100))

            else:
                time.sleep(0.05)

        for parser in parsers:
            parser.terminate()  # safety net; workers exit on their sentinel
            parser.join()

        time_getwords = time.time() - time_getwords

        print("\n\n\n{:.3f} seconds passed for processing {} words".format(
            time_getwords, num_twords))
        print("{} of them already existed".format(num_words_existed))
        print("words_failed: {}".format(words_failed))

        save_addb(self.addb, self.path_addb)
        self.save_wordsfailed(self.path_wf, words_failed)
        print("\n")

        # Sanity check: every candidate word should now be in the database.
        for cw in cands_word:
            if cw not in self.addb.words:
                print("\"{}\" is not in addb!".format(cw))

        return num_finished_words
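One caveat for drain loops like the one above: multiprocessing.Queue.qsize() is only approximate, and raises NotImplementedError on macOS. A sketch of an alternative that avoids polling qsize() by using a short blocking get() instead:

import queue

def try_get(q, timeout=0.05):
    # Fetch the next result, or None if nothing arrives within the timeout.
    # Replaces the qsize() > 0 check plus the time.sleep(0.05) branch.
    try:
        return q.get(timeout=timeout)
    except queue.Empty:
        return None

With this helper the loop body becomes rw = try_get(self.oqueue) followed by a continue when rw is None.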