import numpy as np

# FeatureVector is a project-local class; it is assumed to be imported or
# defined elsewhere in this module.


class Preprocessor:
    def __init__(self, context_n=3):
        self.featureVector = FeatureVector()
        self.phrase_dim = self.featureVector.dim()
        self.phr_cox_dim = self.phrase_dim * 4

    def getDim(self):
        return self.phrase_dim, self.phr_cox_dim

    def concat(self, phrase, tags):
        '''
        Transforms words into vectors using FeatureVector and stacks the
        vectors into a matrix (one row per word).
        '''
        phrase = phrase.split()
        ret_array = np.reshape(
            self.featureVector.vectorise(phrase[0], tags[0]), (1, -1))
        # Start at index 1: the first word's vector is already in
        # ret_array, so starting at 0 would stack it twice.
        for i in range(1, len(phrase)):
            ret_array = np.vstack([
                ret_array,
                self.featureVector.vectorise(phrase[i], tags[i])
            ])
        return ret_array

    def minMaxVector(self, phrase_vec, context_vec):
        '''
        Returns the concatenated vector of maximal and minimal features
        for the given phrase vectors and context vectors.
        '''
        phrase_max_min = np.concatenate(
            (phrase_vec.max(axis=0), phrase_vec.min(axis=0)))
        context_max_min = np.concatenate(
            (context_vec.max(axis=0), context_vec.min(axis=0)))
        return np.concatenate((phrase_max_min, context_max_min))

    def preprocess(self, noun_phrases, context, np_tags, context_tags,
                   skills=False):
        '''
        Main logic: returns vector values for the given phrases and
        context. When a skills list is supplied, a binary label list y
        is returned as well (1 if the phrase is a known skill).
        '''
        phrases_vec = []
        context_vec = []
        phr_cox_vec = []
        y = []
        if skills is not False:
            skills = [x.lower() for x in skills]
        for i in range(len(noun_phrases)):
            current_phrase_vec = self.concat(noun_phrases[i], np_tags[i])
            phrases_vec.append(current_phrase_vec)
            current_context_vec = self.concat(context[i], context_tags[i])
            context_vec.append(current_context_vec)
            phr_cox_vec.append(
                self.minMaxVector(current_phrase_vec, current_context_vec))
            if skills is not False:
                y.append(1 if noun_phrases[i].lower() in skills else 0)
        if skills is not False:
            return phrases_vec, context_vec, phr_cox_vec, y
        return phrases_vec, context_vec, phr_cox_vec
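# A minimal usage sketch (assuming FeatureVector.vectorise returns a 1-D
# numpy array of length dim(); the phrases and POS tags below are made up):
#
#   pre = Preprocessor()
#   noun_phrases = ["machine learning"]
#   context = ["experience with machine learning required"]
#   np_tags = [["NN", "NN"]]
#   context_tags = [["NN", "IN", "NN", "NN", "VBN"]]
#   phrases_vec, context_vec, phr_cox_vec, y = pre.preprocess(
#       noun_phrases, context, np_tags, context_tags,
#       skills=["Machine Learning"])
#   # Each entry of phr_cox_vec has length phr_cox_dim == 4 * phrase_dim
#   # (max + min over the phrase, then max + min over the context).
#   # y == [1] because the phrase matches a skill, case-insensitively.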
import logging

# FeatureVector is a project-local class; it is assumed to be imported or
# defined elsewhere in this module.


class ArffHandler:
    def __init__(self):
        self.__featureVector = FeatureVector()

    def generateArffFile(self, filename, data):
        '''
        Generate sparse ARFF files.

        Inputs:
            filename: any name you like; a file named "<filename>.arff"
                      will be created.
            data: must be of the form
                data = {"$PageId": {
                    "class": $X,
                    "data": {$id1: $x1, $id2: $x2, $id3: $x3, $id4: $x4, ...}
                }}
            example:
                data = {5000: {
                    "class": 1,
                    "data": {"medicine": 400, "cars": 3,
                             "health": 9999, "Metallica": 10}
                }}
        '''
        self.__featureVector.createCountMap(data)
        with open(filename + ".arff", "w") as f:
            f.write("@RELATION " + filename + "\n")
            sortedList = self.__featureVector.getSortedVocabularyMap()
            # THE FIRST TWO ATTRIBUTES ARE RESERVED!
            f.write("@ATTRIBUTE CLASS {0,1}\n")
            f.write("@ATTRIBUTE CATEGORYNAME STRING\n")
            for attribute in sortedList:
                f.write("@ATTRIBUTE " + attribute[0] + "-" +
                        str(attribute[1]) + " NUMERIC\n")
            f.write("@DATA\n")
            for categoryName, innerMap in data.items():
                catClass = innerMap["class"]
                f.write("{0 " + str(catClass))
                f.write(",1 " + str(categoryName))
                for k, v in innerMap["data"].items():
                    f.write("," + str(self.__featureVector.getIndex(k)) +
                            " " + str(v))
                f.write("}\n")
        logging.info("Created file on local drive: %s.arff", filename)
        return filename + ".arff"

    def readArffFile(self, filename):
        featureVector = []  # list of maps
        featureNameList = []
        with open(filename, "r") as f:
            for line in f:
                stripped = line.strip()
                if stripped and not stripped.startswith('@'):
                    # Actual data: sparse rows look like {0 1,1 name,42 7}
                    pairs = stripped.strip("{}").split(',')
                    featureMap = {}
                    for attr in pairs:
                        (key, value) = attr.split(' ', 1)
                        featureMap[int(key)] = value.strip().lower()
                    featureVector.append(featureMap)
                elif stripped.lower().startswith('@attribute'):
                    # Feature definitions (@DATA, @RELATION and blank
                    # lines are skipped)
                    attrName = stripped.split()[1]
                    if attrName.find('-') != -1:
                        # Split on the last dash so attribute names that
                        # themselves contain dashes are handled correctly.
                        (name, idx) = attrName.rsplit('-', 1)
                        featureNameList.append(name)
                        self.__featureVector.updateVocabularyMap(name, idx)
                        logging.debug("Updated %s with index %s", name, idx)
                    else:
                        featureNameList.append(attrName)
        logging.info("Read arff file. Extracted feature vector")
        return [featureNameList, featureVector]

    def generateFeatureVector(self, data, categoryName):
        return self.__featureVector.generateFeatureVector(data, categoryName)
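# A minimal round-trip sketch (the data shape follows the generateArffFile
# docstring; the category id and counts are made up):
#
#   handler = ArffHandler()
#   data = {5000: {"class": 1,
#                  "data": {"medicine": 400, "cars": 3}}}
#   path = handler.generateArffFile("pages", data)
#   featureNames, rows = handler.readArffFile(path)
#   # rows[0] maps sparse attribute indices to string values, e.g.
#   # {0: '1', 1: '5000', ...}; indices 0 and 1 are the reserved
#   # CLASS and CATEGORYNAME attributes.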