Example #1
0
__author__ = 'matias'

from textanalysis.irdatastructs import InvertedIndex
from matplotlib import pyplot as plt

# Builds a stop-word list for one entity type: every indexed term is ranked
# by the number of distinct postings it has, the ranking is echoed to stdout,
# terms above a fixed threshold are written to "<entity_type>_stopwords.txt",
# and the reciprocal counts are plotted.
entity_type = "disease"

# A term is written to the stop-word file when its distinct-posting count is
# strictly greater than this value.
stopword_threshold = 80

index = InvertedIndex(entity_type)
index.load()

# One (term, distinct-posting count) pair per indexed term. The postings are
# presumably ids of the documents containing the term -- TODO confirm against
# InvertedIndex.
ranking = [(term, len(set(index.index[term]))) for term in index.index]

# Ascending by count: rarest terms first, most frequent term last.
ranking.sort(key=lambda pair: pair[1])

with open("%s_stopwords.txt" % (entity_type, ), 'w') as outfile:
    for position, entry in enumerate(ranking, 1):
        term, posting_count = entry
        # Single-argument print(...) emits the same text as the Python 2
        # print statement and also parses on Python 3.
        print("%s %s" % (position, entry))
        if posting_count > stopword_threshold:
            outfile.write("%s\n" % (term, ))

print(len(ranking))

# Plot 1 / count for every ranked term except the final (highest-count)
# entry, which the [:-1] slice leaves out. Terms with a zero count are
# skipped because their reciprocal is undefined (it would raise
# ZeroDivisionError).
plt.plot([1.0 / posting_count
          for _, posting_count in ranking[:-1]
          if posting_count])
plt.show()
Example #2
0
__author__ = 'matias'

from textanalysis.irdatastructs import InvertedIndex
from matplotlib import pyplot as plt

# Ranks every term of the chosen entity index by how many distinct postings
# it has, prints the ranking, stores the most frequent terms as stop words in
# "<entity_type>_stopwords.txt" and plots the reciprocal counts.
entity_type = "disease"

# Terms whose distinct-posting count exceeds this value (strictly) are
# written to the stop-word file.
stopword_threshold = 80

index = InvertedIndex(entity_type)
index.load()

# (term, distinct-posting count) pairs in ascending order of count, so the
# most frequent term ends up last. What a posting identifies is decided by
# InvertedIndex -- presumably a document id; TODO confirm.
ranking = sorted(
    ((term, len(set(index.index[term]))) for term in index.index),
    key=lambda pair: pair[1])

with open("%s_stopwords.txt" % (entity_type,), 'w') as outfile:
    for rank, pair in enumerate(ranking, 1):
        # print(...) with one pre-formatted argument produces identical
        # output on Python 2 and stays valid syntax on Python 3.
        print("%s %s" % (rank, pair))
        if pair[1] > stopword_threshold:
            outfile.write("%s\n" % (pair[0],))

print(len(ranking))

# Plot 1/count for each ranked term, leaving out the last (highest-count)
# entry as the slice does. Zero counts are filtered out so the division
# cannot raise ZeroDivisionError.
plt.plot([1.0/pair[1] for pair in ranking[:-1] if pair[1]])
plt.show()
Example #3
0
__author__ = 'matias'

from textanalysis.entityextractor import DiseaseExtractor, SymptomExtractor
from textanalysis.texts import CaseReportLibrary
from textanalysis.irdatastructs import InvertedIndex

# Builds the "disease" and "symptom" inverted indexes: each case report's
# text is run through both extractors and the distinct entities found are
# added to the matching index under the report's 1-based position. Both
# indexes are saved once the loop ends.
d_index = InvertedIndex("disease")
s_index = InvertedIndex("symptom")

cases = CaseReportLibrary()
d_extractor = DiseaseExtractor()
s_extractor = SymptomExtractor()

# Upper bound on the number of case reports indexed in one run.
max_count = 50000
for count, case in enumerate(cases, 1):
    text = case.get_text()
    # set() drops repeated mentions so each entity is added once per case.
    symptoms = list(set(s_extractor.extract(text)))
    diseases = list(set(d_extractor.extract(text)))
    s_index.add(symptoms, count)
    d_index.add(diseases, count)
    # Report progress before the limit check so the last indexed case is
    # logged as well. Single-argument print(...) gives the same output on
    # Python 2 and parses on Python 3.
    print("%s / %s" % (count, max_count))
    print(symptoms + diseases)
    if count >= max_count:
        break

s_index.save()
d_index.save()
Example #4
0
__author__ = 'matias'

from textanalysis.entityextractor import DiseaseExtractor, SymptomExtractor
from textanalysis.texts import CaseReportLibrary
from textanalysis.irdatastructs import InvertedIndex

# Populates two inverted indexes ("disease" and "symptom") from the case
# report library. Every report is numbered from 1 in iteration order; the
# distinct entities extracted from its text are added to the corresponding
# index under that number, and both indexes are saved at the end.
d_index = InvertedIndex("disease")
s_index = InvertedIndex("symptom")

cases = CaseReportLibrary()
d_extractor = DiseaseExtractor()
s_extractor = SymptomExtractor()

# Stop after this many case reports have been indexed.
max_count = 50000
for case_number, case in enumerate(cases, 1):
    text = case.get_text()
    # Duplicates are removed so an entity is indexed once per case report.
    symptoms = list(set(s_extractor.extract(text)))
    diseases = list(set(d_extractor.extract(text)))
    s_index.add(symptoms, case_number)
    d_index.add(diseases, case_number)
    # Progress is printed before the limit check; printing after it would
    # leave the final indexed case unreported. The pre-formatted
    # single-argument print(...) works on both Python 2 and Python 3.
    print("%s / %s" % (case_number, max_count))
    print(symptoms + diseases)
    if case_number >= max_count:
        break

s_index.save()
d_index.save()