def read_csv(self):
    """
    Load local data (CSV) and build the three structures used for smart crawl.

    **localdata_ids** Set of unique ids, e.g. {'uniqueid1', 'uniqueid2'}.

    **localdata_query** For each record, the fields named by querylist split
    into words, with stop words and words shorter than 3 characters removed.
    Dict used for query pool generation:
    {'uniqueid': ['database', 'laboratory']}

    **localdata_er** List used for similarity join:
    [(['yong', 'jun', 'he', 'simon', 'fraser'], 'uniqueid')]
    """
    with open(self.__localPath, 'rb') as csvfile:
        data_raw = [row for row in csv.reader(csvfile)]
    uniqueid_index = 0
    querylist_index = []
    matchlist_index = []
    try:
        header = data_raw.pop(0)
        # Strip the UTF-8 BOM Excel may prepend to the first header cell,
        # mirroring the sample-file read_csv(); without this the uniqueid
        # column cannot be located by name in BOM-prefixed files.
        header[0] = header[0].replace(b'\xef\xbb\xbf', '')
        uniqueid_index = header.index(self.__uniqueId)
        for q in self.__queryList:
            querylist_index.append(header.index(q))
        for m in self.__matchList:
            matchlist_index.append(header.index(m))
    except ValueError:
        print >> perr, "Can't find attributes"
        exit(1)  # was exit(0): a missing attribute is an error, not success
    localdata_query = {}
    localdata_er = []
    localdata_ids = set()
    # set, not list: O(1) membership tests in the per-word filter below
    stop_words = {'and', 'for', 'the', 'with', 'about'}
    for row in data_raw:
        try:
            r_id = row[uniqueid_index]
        except IndexError:
            # short/malformed row with no id column: skip the whole record
            continue
        localdata_ids.add(r_id)
        tempbag = []
        for q in querylist_index:
            try:
                tempbag.extend(wordset(row[q]))
            except IndexError:
                # this row is too short to hold the field: ignore it
                continue
        localdata_query[r_id] = [w for w in tempbag
                                 if w not in stop_words and len(w) >= 3]
        bag = []
        for m in matchlist_index:
            try:
                bag.extend(wordset(row[m]))
            except IndexError:
                continue
        localdata_er.append((bag, r_id))
    self.setlocalData(localdata_ids, localdata_query, localdata_er)
def proResult(self, result_raw):
    """
    Merge the raw data and keep them in a dict. Then, pre-process the raw
    data for similarity join.

    :param result_raw: the raw result returned by api.
    :return: a list for similarity join.
        [(['yong', 'jun', 'he', 'simon', 'fraser'], 'uniqueid')]
    :raises KeyError: some messages would miss some fields (caught and
        skipped internally).
    """
    result_merge = self.__mergeResult
    result_er = []
    for row in result_raw:
        # NOTE(review): self.__uniqueId and each matchlist entry appear to
        # be expression strings evaluated against the local name `row`
        # (presumably e.g. "row['id']") -- do not rename `row`. eval on
        # configuration strings is a security risk if they are untrusted.
        try:
            r_id = eval(self.__uniqueId)
        except KeyError:
            # message is missing its id field: skip it entirely
            continue
        if r_id not in result_merge:
            # keep only the first occurrence of each id
            result_merge[r_id] = row
        bag = []
        for v in self.__matchList:
            try:
                bag.extend(wordset(eval(v)))
            except KeyError:
                # field absent from this message: ignore it
                continue
        result_er.append((bag, r_id))
    self.setMergeResult(result_merge)
    return result_er
def read_pickle(self):
    """
    Load local data (pickle) and build the structures used for smart crawl.

    **localdata_ids** Set of unique ids, e.g. {'uniqueid1', 'uniqueid2'}.

    **localdata_query** querylist fields of each message split into words,
    with stop words and words shorter than 3 characters dropped; used for
    query pool generation: {'uniqueid': ['database', 'laboratory']}

    **localdata_er** List for similarity join:
    [(['yong', 'jun', 'he', 'simon', 'fraser'], 'uniqueid')]

    Also keeps the full record of every id in **localdata_record**.
    """
    with open(self.__localPath, 'rb') as f:
        data_raw = pickle.load(f)
    # pre-split the dotted attribute paths once, outside the record loop
    id_path = self.__uniqueId.split('.')
    query_paths = [q.split('.') for q in self.__queryList]
    match_paths = [m.split('.') for m in self.__matchList]
    localdata_record = {}
    localdata_query = {}
    localdata_er = []
    localdata_ids = set()
    stop_words = ['and', 'for', 'the', 'with', 'about']
    for record in data_raw:
        r_id = getElement(id_path, record)
        localdata_ids.add(r_id)
        localdata_record[r_id] = record
        words = []
        for path in query_paths:
            words.extend(wordset(getElement(path, record)))
        localdata_query[r_id] = [w for w in words
                                 if w not in stop_words and len(w) >= 3]
        match_bag = []
        for path in match_paths:
            match_bag.extend(wordset(getElement(path, record)))
        localdata_er.append((match_bag, r_id))
    self.setlocalData(localdata_ids, localdata_query, localdata_er,
                      localdata_record)
def read_pickle(self):
    """
    Load sample data (pickle) and build the same structure as
    localdata_query for smart crawl.

    **sample** querylist fields of each message split into words, keyed by
    unique id: {'uniqueid': ['database', 'laboratory']}
    """
    with open(self.__samplePath, 'rb') as f:
        sample_raw = pickle.load(f)
    # pre-split the dotted attribute paths once, outside the record loop
    id_path = self.__uniqueId.split('.')
    query_paths = [q.split('.') for q in self.__queryList]
    sample = {}
    for record in sample_raw:
        r_id = getElement(id_path, record)
        words = []
        for path in query_paths:
            words.extend(wordset(getElement(path, record)))
        sample[r_id] = words
    self.setSample(sample)
def read_csv(self):
    """
    Load sample data (CSV) and build the same structure as localdata_query
    for smart crawl.

    **sample** querylist fields of each message split into words, keyed by
    unique id: {'uniqueid': ['database', 'laboratory']}
    """
    with open(self.__samplePath, 'rb') as csvfile:
        sample_raw = [row for row in csv.reader(csvfile)]
    uniqueid_index = 0
    querylist_index = []
    try:
        header = sample_raw.pop(0)
        # strip the UTF-8 BOM Excel may prepend to the first header cell,
        # so the uniqueid column can be found by name
        header[0] = header[0].replace(b'\xef\xbb\xbf', '')
        uniqueid_index = header.index(self.__uniqueId)
        for q in self.__queryList:
            querylist_index.append(header.index(q))
    except ValueError:
        print >> perr, "Can't find attributes"
        exit(1)  # was exit(0): a missing attribute is an error, not success
    sample = {}
    for row in sample_raw:
        try:
            r_id = row[uniqueid_index]
        except IndexError:
            # short/malformed row with no id column: skip the record
            continue
        bag = []
        for q in querylist_index:
            try:
                bag.extend(wordset(row[q]))
            except IndexError:
                # this row is too short to hold the field: ignore it
                continue
        sample[r_id] = bag
    self.setSample(sample)
def proResult(self, result_raw):
    """
    Merge raw api results into the running dict and pre-process them for
    similarity join.

    :param result_raw: the raw result returned by api.
    :return: a list for similarity join.
        [(['yong', 'jun', 'he', 'simon', 'fraser'], 'uniqueid')]
    """
    # pre-split the dotted attribute paths once, outside the record loop
    id_path = self.__uniqueId.split('.')
    match_paths = [m.split('.') for m in self.__matchList]
    merged = self.__mergeResult
    er_rows = []
    for record in result_raw:
        r_id = getElement(id_path, record)
        if r_id not in merged:
            # keep only the first occurrence of each id
            merged[r_id] = record
        words = []
        for path in match_paths:
            words.extend(wordset(getElement(path, record)))
        er_rows.append((words, r_id))
    self.setMergeResult(merged)
    return er_rows
def read_pickle(self):
    """
    Load sample data and then generate a same data structures as
    localdata_query used for smart crawl.

    **sample** Split the fields into a list of words defined by querylist
    of each message. Then generate a dict for query pool generation.
    {'uniqueid': ['database', 'laboratory']}
    """
    with open(self.__samplePath, 'rb') as f:
        sample_raw = pickle.load(f)
    sample = {}
    for row in sample_raw:
        # NOTE(review): self.__uniqueId and each querylist entry appear to
        # be expression strings evaluated against the local name `row`
        # (presumably e.g. "row['id']") -- do not rename `row`. eval on
        # configuration strings is a security risk if they are untrusted.
        try:
            r_id = eval(self.__uniqueId)
        except KeyError:
            # message is missing its id field: skip it
            continue
        bag = []
        for v in self.__queryList:
            try:
                bag.extend(wordset(eval(v)))
            except KeyError:
                # field absent from this message: ignore it
                continue
        sample[r_id] = bag
    self.setSample(sample)