コード例 #1
0
ファイル: build.py プロジェクト: FirstDraftGIS/firstdraft
# Characters that are replaced by a space before a document is split into words.
_SEPARATORS = "#_()/:."


def _tokenize(document, stopwords):
    """Return the lower-cased words of *document* that are worth indexing.

    Separator characters are turned into spaces, the text is split on
    whitespace, and words that are stopwords or have three characters or
    fewer are dropped.
    """
    lowered = document.lower()
    for separator in _SEPARATORS:
        lowered = lowered.replace(separator, " ")
    return [word for word in lowered.split() if word not in stopwords and len(word) > 3]


def run():
    """Build the LSI model from feature text and refresh the Topic table.

    Reads the text of every Feature whose text is not None, tokenizes it,
    drops tokens that occur only once across all documents, and then:

    * saves the gensim dictionary to "<this directory>/dictionary",
    * trains a 10-topic LsiModel and saves it to "<this directory>/model",
    * replaces all Topic rows with one row per topic shown by the model.

    Any exception is caught and printed; nothing is re-raised and nothing is
    returned.
    """
    # NOTE: every print below passes a single pre-formatted string, so the
    # output is the same under Python 2's print statement.
    try:
        print("starting to build LSI Model")

        documents = Feature.objects.exclude(text=None).values_list("text", flat=True)
        number_of_documents = len(documents)
        print("number_of_documents: %s" % (number_of_documents,))

        stopwords = [month.lower() for month in month_to_number]
        stopwords += nltk_stopwords.words('english')
        # The count is reported before the words from stopwords.txt are added.
        print("stopwords: %s" % (len(stopwords),))
        with open(path_to_directory_of_this_file + "/stopwords.txt") as f:
            for word in f.read().decode("utf-8").split("\n"):
                # Skip blank lines and "#" comment lines.
                if word and not word.startswith("#"):
                    stopwords.append(word)
        stopwords = set(stopwords)

        texts = [_tokenize(document, stopwords) for document in documents]

        # Count every token across the whole collection so that tokens seen
        # only once can be discarded.
        counter = Counter()
        for text in texts:
            counter.update(text)
        texts = [[token for token in text if counter[token] > 1] for text in texts]

        dictionary = Dictionary(texts)
        print("dictionary: %s" % (dictionary,))
        dictionary.save(path_to_directory_of_this_file + "/dictionary")

        corpus = [dictionary.doc2bow(text) for text in texts]
        print("corpus: %s" % (type(corpus),))

        print("generating lsi model")
        lsi = LsiModel(corpus=corpus, id2word=dictionary, num_topics=10)
        print("saving LSI model")
        lsi.save(path_to_directory_of_this_file + "/model")

        # Build the replacement rows BEFORE deleting the old ones, so that a
        # failure while formatting a topic does not leave the table empty.
        topics = [
            Topic(id=topic[0], name=prettify_topic(topic[1]))
            for topic in lsi.show_topics()
        ]
        Topic.objects.all().delete()
        Topic.objects.bulk_create(topics)

    except Exception as e:
        # Failures are reported on stdout and not propagated to the caller.
        print(e)
コード例 #2
0
from appfd.models import Feature, Place, Topic
from appfd.scripts.ai.lsi.get_topic import run as get_topic
from appfd.scripts.ai.lsi.build import run as build
from collections import Counter
from date_extractor import month_to_number
from datetime import datetime

# Lower-cased month_to_number keys, collected as stopwords.
stopwords = [month.lower() for month in month_to_number]


def run(places=None):

    try:

        start = datetime.now()

        if places is None:

            # will probably have to do a yield thing at some point
            #places = Place.objects.exclude(featureplace=None, featureplace__correct=True, featureplace__feature__verified=True)
            places = Place.objects.exclude(featureplace=None)

            place_ids = Feature.objects.filter(
                featureplace__correct=True,
                featureplace__feature__verified=True).values_list(
                    "featureplace__place_id", flat=True)

        print("place_ids:", len(place_ids))
        for place_id in place_ids:

            counter = Counter()
コード例 #3
0
from appfd.models import Feature, Place, Topic
from appfd.scripts.ai.lsi.get_topic import run as get_topic
from appfd.scripts.ai.lsi.build import run as build
from collections import Counter
from date_extractor import month_to_number
from datetime import datetime

# Lower-cased month_to_number keys, collected as stopwords.
stopwords = [month.lower() for month in month_to_number]

def run(places=None):

    try:

        start = datetime.now()

        if places is None:

            # will probably have to do a yield thing at some point
            #places = Place.objects.exclude(featureplace=None, featureplace__correct=True, featureplace__feature__verified=True)
            places = Place.objects.exclude(featureplace=None)

            place_ids = Feature.objects.filter(featureplace__correct=True, featureplace__feature__verified=True).values_list("featureplace__place_id", flat=True)

        print "place_ids:", len(place_ids)
        for place_id in place_ids:

            counter = Counter()
            for feature in Feature.objects.filter(verified=True, featureplace__place_id=place_id, featureplace__correct=True).exclude(text=None).exclude(text=""):
                if feature.text:
                    topic_id = get_topic(feature.text)
                    if topic_id: