def dataSourceToNLP():
    """Stream every row of the input CSV through the parse-tree NLP pipeline,
    appending the resulting tree dicts to the results CSV row by row.

    Reads:  ROOT/inFilename — a CSV with a 'content' text column.
    Writes: ROOT/results/outFilename — opened in append mode per row.

    BUG FIX: the original passed writeheader=True on *every* append call,
    which repeats the CSV header line once per source row. The header is
    now written only on the first write.
    """
    df = pd.read_csv(ROOT + '/' + inFilename)
    sizeStr = str(len(df.index))
    outPath = ROOT + '/results/' + outFilename
    wroteHeader = False
    # try Vectorization later
    for index, row in df.iterrows():
        text = cleanText(row['content'])
        doc = nlpc.textToDoc(text)
        print(str(index) + '/' + sizeStr)  # progress: current row / total rows
        treeDicts = nlpc.docToParseTreeDictList(doc, MAX_CHILD, MAX_DEPTH)
        # Header goes out exactly once, with the first batch of dicts.
        io.mapListToCsv(outPath, treeDicts, 'a', writeheader=not wroteHeader)
        wroteHeader = True
def dataSourceUnevenHeaderToNLP_v2(stopAt=100):
    """Parse the first `stopAt`+1 rows of the input CSV, accumulate every
    parse-tree dict in memory, then write them all at once ('w' mode) so a
    single CSV header can cover the union of keys.

    Reads:  ROOT/inFilename — a CSV with a 'content' text column.
    Writes: ROOT/results/outFilename_write — overwritten each run.
    """
    frame = pd.read_csv(ROOT + '/' + inFilename)
    total = str(len(frame.index))
    collected = []
    # try Vectorization later
    for rowIdx, rowData in frame.iterrows():
        if rowIdx > stopAt:
            break
        cleaned = cleanText(rowData['content'])
        parsed = nlpc.textToDoc(cleaned)
        print(str(rowIdx) + '/' + total)  # progress: current row / total rows
        collected.extend(nlpc.docToParseTreeDictList(parsed, MAX_CHILD, MAX_DEPTH))
    # One-shot write: header reflects every key seen across all rows.
    io.mapListToCsv(ROOT + '/results/' + outFilename_write, collected, 'w', writeheader=True)
def run():
    """Parse one project's raw text file and emit its depth-3..6 tree NLP
    sequences as a single CSV in the project's output folder.

    Reads:  inputFolder/<project>.txt
    Writes: outputFolder/<project>/nlp_seqs_tree_depths.csv
    """
    srcPath = inputFolder + '/' + project + '.txt'
    dstFolder = outputFolder + '/' + project
    rawText = io.textFileToString(srcPath)
    parsedDoc = nlpc.textToDoc(rawText)
    # Depths 3 through 6 inclusive, matching the earlier per-depth experiments.
    seqRows = nlpc.docToMaxDepthsTreeNLPSequenceList(parsedDoc, range(3, 7))
    io.mapListToCsv(dstFolder + '/nlp_seqs_tree_depths.csv', seqRows)
def dataSourceUnevenHeaderToNLP(headerSampleCount=10, writeHeader=True):
    """Two-pass CSV export: first sample rows to discover the union of tree
    dict keys (the header), then append every row's dicts under that header.

    Reads:  ROOT/inFilename — a CSV with a 'content' text column.
    Writes: ROOT/results/outFilename — append mode.

    headerSampleCount -- number of leading rows parsed to sample the header.
    writeHeader       -- when True, emit the sampled header row before the data.

    BUG FIX: originally `sampledHeader` was assigned only inside the
    `if writeHeader:` branch but used unconditionally in the main loop, so
    writeHeader=False raised NameError. The header is now sampled
    unconditionally (it is needed for every data write); only the header-row
    write itself is gated on writeHeader. Behavior with writeHeader=True is
    unchanged.
    """
    df = pd.read_csv(ROOT + '/' + inFilename)
    sizeStr = str(len(df.index))
    outPath = ROOT + '/results/' + outFilename
    headerSampleMaplist = []
    # get header from sampling — parse the first rows to collect the key set
    # try Vectorization later
    for index, row in df.iterrows():
        if index > headerSampleCount:
            break
        text = cleanText(row['content'])
        doc = nlpc.textToDoc(text)
        print(str(index) + '/' + sizeStr)  # progress: current row / total rows
        headerSampleMaplist.extend(nlpc.docToParseTreeDictList(doc, MAX_CHILD, MAX_DEPTH))
    sampledHeader = io.getHeaderFromMapList(headerSampleMaplist)
    if writeHeader:
        # write header row only (empty data list)
        io.mapListToCsv(outPath, [], 'a', header=sampledHeader)
    # Second pass: full file, appending each row's dicts under the fixed header.
    for index, row in df.iterrows():
        text = cleanText(row['content'])
        doc = nlpc.textToDoc(text)
        print(str(index) + '/' + sizeStr)
        treeDicts = nlpc.docToParseTreeDictList(doc, MAX_CHILD, MAX_DEPTH)
        io.mapListToCsv(outPath, treeDicts, 'a', header=sampledHeader)
def run():
    """Run the full per-sentence NLP extraction for one project: token dicts,
    raw NLP sequences, and simplified NLP sequences, each to its own CSV.

    Reads:  inputFolder/<project>.txt
    Writes: outputFolder/<project>/nlp_dicts.csv
            outputFolder/<project>/nlp_seqs.csv
            outputFolder/<project>/nlp_seqs_sim.csv
    """
    srcFile = inputFolder + '/' + project + '.txt'
    dstFolder = outputFolder + '/' + project
    rawText = io.textFileToString(srcFile)

    # Per-sentence NLP dicts.
    dictRows = []
    dictRows += nlpc.textToNLPDictsList(rawText)
    io.mapListToCsv(dstFolder + '/nlp_dicts.csv', dictRows)

    # Full and simplified sequence exports.
    io.mapListToCsv(dstFolder + '/nlp_seqs.csv', nlpc.textToNLPSequenceList(rawText))
    io.mapListToCsv(dstFolder + '/nlp_seqs_sim.csv', nlpc.textToSimplifiedNLPSequenceList(rawText))
import sys

import io_local.io as io
import nlp.nlp_controller as nlpc

# Machine-specific local-storage roots for NLP inputs/outputs.
inputFolder = 'C:/Users/John/Documents/pgi_dev/NLP_local_storage/inputs'
outputFolder = 'C:/Users/John/Documents/pgi_dev/NLP_local_storage/outputs'

print(sys.argv)
# Project name comes in as the first CLI argument.
project = sys.argv[1]
inFilename = inputFolder + '/' + project + '.csv'
outFilename = outputFolder + '/' + project + '.csv'

res = []
# NOTE(review): csvToList/mapListToCsv below are called with the bare project
# name, not the inFilename/outFilename paths built above — the computed paths
# appear unused. Confirm whether io resolves names to paths itself or whether
# the path variables were meant to be passed here.
textArrList = io.csvToList(project)
print(textArrList)
for textArr in textArrList:
    # assumes the first column of each row holds the raw text — TODO confirm
    text = textArr[0]
    sentNlpMapList = nlpc.textToNLPDictsList(text)
    res += sentNlpMapList
io.mapListToCsv(project, res)