def extractConceptst(outputfile, listbooks_concepts, preprocessed_concepts=None,
                     notprocessed_concepts=None, topcount=1):
    """Collect concept phrases for each book file and write a doc2tag training file.

    For every folder in *preprocessed_concepts*, the top-``topcount`` phrases of
    each matching ``*.phrases`` file are taken as-is (spaces replaced by ``_``).
    For every folder in *notprocessed_concepts*, each phrase is first run through
    ``preprocessText`` and then joined with ``_``.  Finally the corpus documents
    are loaded and one tag line plus one text line per document is written to
    *outputfile*.

    Args:
        outputfile: path of the CSV-like training file to write.
        listbooks_concepts: iterable of book-name prefixes used to filter files.
        preprocessed_concepts: folder names (under CONCEPT_FOLDER_BASE) whose
            phrase files are already preprocessed.  Defaults to no folders.
        notprocessed_concepts: folder names whose phrases still need
            preprocessing.  Defaults to no folders.
        topcount: maximum number of phrases kept per preprocessed file.

    Returns:
        Tuple ``(l_concepts, lConcept_len)``: mapping of doc-id -> collected
        concepts, and the per-document concept counts in corpus order.
    """
    # None-sentinels instead of mutable default arguments (the original []
    # defaults were shared across calls).
    preprocessed_concepts = preprocessed_concepts or []
    notprocessed_concepts = notprocessed_concepts or []
    books = tuple(listbooks_concepts)
    l_concepts = {}

    # Pass 1: already-preprocessed folders — keep only the first `topcount`
    # non-empty phrases of each file.
    for folder in preprocessed_concepts:
        for fname in os.listdir(CONCEPT_FOLDER_BASE + folder):
            if not (fname.endswith("phrases") and fname.startswith(books)):
                continue
            key = fname.replace(".txt.phrases", "")
            l_concepts.setdefault(key, set())
            kept = 0  # reset per file so every file contributes up to `topcount`
            with open(CONCEPT_FOLDER_BASE + folder + "/" + fname, 'r') as fh:
                for line in fh:
                    # Lines are "phrase,tfidf"; the tf-idf score is not used here.
                    if len(line.strip()) > 1 and kept < topcount:
                        kept += 1
                        l_concepts[key].add(line.split(",")[0].replace(" ", "_"))

    # Pass 2: raw folders — preprocess each phrase before collecting it.
    for folder in notprocessed_concepts:
        for fname in os.listdir(CONCEPT_FOLDER_BASE + folder):
            if not (fname.endswith("phrases") and fname.startswith(books)):
                continue
            key = fname.replace(".txt.phrases", "")
            l_concepts.setdefault(key, set())
            with open(CONCEPT_FOLDER_BASE + folder + "/" + fname, 'r') as fh:
                for line in fh:
                    l_concepts[key].add('_'.join(preprocessText(line.split(",")[0])))

    # Pass 3: write "<doc-id> <concepts...>" followed by the document text.
    documents = load_document(IR_CORPUS, books)
    lConcept_len = []
    with open(outputfile, 'w') as fcsv:
        for doc in documents:
            if doc.id not in l_concepts:
                # NOTE(review): kept as a list (not a set) to match the
                # original return value for documents without concepts.
                l_concepts[doc.id] = []
            lConcept_len.append(len(l_concepts[doc.id]))
            tags = ' '.join(l_concepts[doc.id]).replace("\n", "").replace("\t", "")
            fcsv.write(doc.id.replace(" ", "_") + " " + tags + "\n")
            fcsv.write(' '.join(preprocessText(doc.text, stemming=False,
                                               stopwords_removal=False)) + "\n")
    return l_concepts, lConcept_len
def _record_concept(token, context):
    """Append *context* words to the global ``conceptdocs`` entry for *token*.

    Only tokens present in the global ``l_concepts`` vocabulary are recorded.

    Returns:
        True when *token* is a known concept (so the caller can tag the
        document with it), False otherwise.
    """
    if token not in l_concepts:
        return False
    if token in conceptdocs:
        conceptdocs[token] += list(context)
    else:
        conceptdocs[token] = list(context)
    return True


def filltokendict(document, category=None):
    """Scan a document for known concept phrases and collect their contexts.

    Tokenizes ``document.text`` (no stemming, stopwords removed) and slides
    windows of 5+k tokens over it for concept lengths k = 1, 2, 3: the first k
    (or last k) stemmed tokens form a candidate concept, and the remaining 5
    tokens of the window are stored as that concept's context words in the
    global ``conceptdocs`` map.  When *category* is a non-empty string, every
    concept found in the document is also tagged with it in the global
    ``conceptcategory`` map.

    Args:
        document: object with a ``text`` attribute holding the raw document.
        category: optional category label for all concepts found here.
    """
    tokens = preprocessText(document.text, stemming=False, stopwords_removal=True)
    doc_concepts = set()
    # The original code repeated this stanza verbatim for n = 6, 7, 8; the
    # only variation is the concept length k = n - 5.  Prefix concepts keep
    # the trailing 5 tokens as context, suffix concepts the leading 5.
    for k in (1, 2, 3):
        for ngram in nltk.ngrams(tokens, n=5 + k):
            prefix = ' '.join(stem(w) for w in ngram[:k])
            if _record_concept(prefix, ngram[k:]):
                doc_concepts.add(prefix)
            suffix = ' '.join(stem(w) for w in ngram[-k:])
            if _record_concept(suffix, ngram[:5]):
                doc_concepts.add(suffix)
    if category is not None and category != "":
        for concept in doc_concepts:
            conceptcategory.setdefault(concept, []).append(category)
for line in open( CONCEPT_FOLDER_BASE + pconceptfolders + "/" + file, 'r').readlines(): concept = line.split(",")[0] l_concepts.add(concept) for notpconceptfolder in notprocessed_concepts: for file in os.listdir(CONCEPT_FOLDER_BASE + notpconceptfolder): fwrite = open( CONCEPT_FOLDER_BASE + notpconceptfolder + "_stem/" + file, 'w') if file.endswith("phrases") and file.startswith( tuple(listbooks_concepts)): for line in open( CONCEPT_FOLDER_BASE + notpconceptfolder + "/" + file, 'r').readlines(): pconcept = preprocessText(line.split(",")[0]) fwrite.write(' '.join(pconcept) + "\n") l_concepts.add(' '.join(pconcept)) fconcepts = ['data/file2.txt', 'data/file3.txt'] ## From Files for cfile in fconcepts: for line in open(cfile).readlines(): l_concepts.add(line.strip()) category_file = 'data/chapterwise_title.csv' chapterwise_titledict = conceptcategories(category_file) listbooks = ['irv-', 'issr-', 'foa-', 'sigir-', 'zhai-', 'seirip-', 'wiki-'] # listbooks = ['sigir']
'r').readlines(): concept = line.split(",")[0].replace(" ", "_") l_concepts[fnamek].add(concept) for notpconceptfolder in notprocessed_concepts: for file in os.listdir(CONCEPT_FOLDER_BASE + notpconceptfolder): if file.endswith("phrases") and file.startswith( tuple(listbooks_concepts)): fnamek = file.replace(".txt.phrases", "") if (fnamek not in l_concepts): l_concepts[fnamek] = set() for line in open( CONCEPT_FOLDER_BASE + notpconceptfolder + "/" + file, 'r').readlines(): pconcept = preprocessText(line.split(",")[0]) l_concepts[fnamek].add('_'.join(pconcept)) IR_CORPUS = 'data/keyphrase/textbook/all_text.csv' documents = load_document(IR_CORPUS, listbooks_concepts) outputfile = 'doc2tagtrain_nostopwords_nostem.csv' fcsv = open(outputfile, 'w') for doc in documents: if doc.id in l_concepts: # print(' '.join(l_concepts[doc.id]).replace("\n","").replace("\t","")+"\n") fcsv.write( doc.id.replace(" ", "_") + " " + ' '.join(l_concepts[doc.id]).replace("\n", "").replace("\t", "") +