def features(inputDir):
    """Collect per-utterance feature and label lists for a data directory.

    Every dialogue yielded by ``tools.get_data(inputDir)`` is walked in
    order; for each position the result of ``sentFeatures(dialogue, index)``
    and of ``sentLabels(utterance)`` is recorded.

    Args:
        inputDir: Directory handed unchanged to ``tools.get_data``.

    Returns:
        A 3-tuple ``(features, labels, csv_data)`` where ``features`` and
        ``labels`` are flat, parallel lists covering all dialogues, and
        ``csv_data`` is the materialized list of dialogues.
    """
    # Materialize the loader's output so it can be both iterated and returned.
    csv_data = list(tools.get_data(inputDir))

    feature_rows = []
    label_rows = []
    for dialogue in csv_data:
        for index, utterance in enumerate(dialogue):
            # Keep the two lists aligned: one feature entry, then its label.
            feature_rows.append(sentFeatures(dialogue, index))
            label_rows.append(sentLabels(utterance))

    return feature_rows, label_rows, csv_data
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 11 12:45:35 2020

@author: Likhita Suresh
"""
import pycrfsuite
import hw2_corpus_tools as tools
import os

# Locate the "trainSmall" directory under the current working directory.
# os.path.join picks the platform's separator instead of a hard-coded
# Windows backslash, so the path is also valid on POSIX systems.
path = os.path.join(os.getcwd(), "trainSmall")
print(path)

# Materialize everything tools.get_data yields for that directory and dump it.
csv_data = list(tools.get_data(path))
print(csv_data)
import pycrfsuite
import hw2_corpus_tools as tools
import os
import sys

# Positional command-line arguments, in order: the directory passed to
# tools.get_data below, a second directory, and an output file name.
inputDir = sys.argv[1]
testDir = sys.argv[2]
outputFile = sys.argv[3]

# Materialize everything tools.get_data yields for the input directory.
csv_data = list(tools.get_data(inputDir))


def tokenFeatures(wordList):
    """Build the token-based feature strings for a list of words.

    Args:
        wordList: Sequence of objects exposing a string ``token`` attribute.

    Returns:
        A list of strings, all prefixed with ``TOKEN_``. For each word, in
        order: the word's token, followed by one entry for every letter pair
        in ``("th", "he", "in")`` that occurs in that token. After all words:
        one entry per pair of adjacent words, formed by concatenating the two
        tokens.
    """
    letter_pairs = ("th", "he", "in")
    tokens = []

    for word in wordList:
        text = word.token
        tokens.append('TOKEN_' + text)
        # Search the token's text for each letter pair. Testing the word
        # object itself (``pair in word``) would not do a substring search.
        tokens.extend('TOKEN_' + pair for pair in letter_pairs if pair in text)

    # Bigram features: each token joined directly to its successor.
    for current, following in zip(wordList, wordList[1:]):
        tokens.append('TOKEN_' + current.token + following.token)

    return tokens