def createInstanceObjectList(processed_dataset):
    """Create a list of Instance objects from pre-processed dataset items.

    Each item is an indexable record where item[1] is the raw text and
    item[2] is its label (assumed from the indexing below -- TODO confirm
    against the caller that produces processed_dataset).

    NOTE(review): an identically named function defined later in this file
    shadows this one at import time -- confirm which version callers need.
    """
    print('Reading instances...')
    instanceObjects = []
    for item in processed_dataset:
        data = item[1].strip('\n')
        if not data:
            # Empty line: original produced no Instance for it either.
            continue
        # Re-tokenize and re-join with single spaces so that the token
        # positions of content.split() below match the tokenizer's output.
        all_words = word_tokenize(data)
        content = ' '.join(str(elem) for elem in all_words)
        label = item[2]
        instanceObject = Instance(content, label)
        # tokenDictionary is 1-indexed by token position.
        for i, token in enumerate(content.split()):
            instanceObject.tokenDictionary[i + 1] = Token(token)
        if FeatureSelection.getInstance(featureFile).normalizeInstances:
            instanceObject.tokenDictionary = instanceObject.normalizeTokens()
        instanceObjects.append(instanceObject)
    return instanceObjects
def createInstanceObjectList(inputFileName):
    """Create a list of Instance objects from a tab-separated input file.

    Each non-blank line must hold exactly two tab-separated fields:
    the sentence content and its label. The resulting Instance objects
    carry all (linguistic) information needed to extract the features
    for sentiment polarity classification.

    NOTE(review): this definition shadows the identically named function
    defined earlier in this file.
    """
    print('Reading instances...')
    instanceObjects = []
    with codecs.open(inputFileName, 'r', 'utf8') as inputFile:
        for rawLine in inputFile:
            line = rawLine.strip()
            if not line:
                # Skip blank lines: the original raised ValueError here
                # when unpacking the split of an empty string.
                continue
            content, label = line.split('\t')
            instanceObject = Instance(content, label)
            # tokenDictionary is 1-indexed by token position.
            for i, token in enumerate(content.split()):
                instanceObject.tokenDictionary[i + 1] = Token(token)
            if FeatureSelection.getInstance(featureFile).normalizeInstances:
                instanceObject.tokenDictionary = instanceObject.normalizeTokens()
            instanceObjects.append(instanceObject)
    return instanceObjects