def load_bnc_word_freqs():
    """Load per-term document frequencies from the BNC corpus file.

    Each line of the data file is space-separated:
    term-frequency, term, POS tag, document-frequency.
    Returns a dict mapping term -> int document frequency.  Only the
    first occurrence of a term is kept.
    """
    # filename = "/home/rob/git/thinklink/reference/bnc_corpus_all.num.o5.txt"
    docfreqs = {}
    for line in localfile("data/bnc_corpus_all.num.o5.txt"):
        # `pos` was previously named `type`, which shadowed the builtin
        termfreq, term, pos, docfreq = line.strip().split(" ")
        if term not in docfreqs:
            # TODO: not sure if this is correct — keeps the first entry
            # seen per term; confirm the file's ordering makes that right
            docfreqs[term] = int(docfreq)
    return docfreqs
def load_bnc_word_freqs():
    """Return a {term: document-frequency} dict built from the BNC corpus file.

    File format per line (space-separated): term frequency, term,
    POS tag, document frequency.  The first entry seen for a term wins.
    """
    # filename = "/home/rob/git/thinklink/reference/bnc_corpus_all.num.o5.txt"
    freqs = {}
    for raw in localfile("data/bnc_corpus_all.num.o5.txt"):
        termfreq, term, pos, docfreq = raw.strip().split(" ")
        if term in freqs:
            continue  # TODO: not sure if this is correct
        freqs[term] = int(docfreq)
    return freqs
def load_ranges(ranges):
    """Load and return the pickled SVM scaling ranges.

    Note: the `ranges` parameter is unused; it is kept so the signature
    stays compatible with existing callers.
    """
    # BUG FIX: the file was opened with mode "w", which truncates the
    # data and makes pickle.load fail — open for reading instead
    # (mode "r" mirrors the text-mode "w" used by save_ranges).
    return pickle.load(settings.localfile("data/svm_range.range", "r"))
def save_ranges(ranges):
    """Pickle `ranges` to the SVM range data file."""
    outfile = settings.localfile("data/svm_range.range", "w")
    try:
        pickle.dump(ranges, outfile)
    finally:
        # close even if pickling raises, so the handle never leaks
        outfile.close()
import urllib2
import settings
from nlptools.html_to_text import html_to_text
import re


def tokenize(claim):
    """Split `claim` into word and non-word runs (delimiters retained).

    The capturing group in the pattern keeps the separators in the
    result list.
    """
    # raw string: "\W" in a plain string is an invalid escape sequence
    return re.split(r"(\W+)", claim)


def sentences(text):
    """Naive sentence split on periods (no abbreviation handling)."""
    return text.split(".")


# Known first words, loaded once at import time from a |-separated list.
firstwords_keys = set(
    settings.localfile("data/firstwords_keys.list").read().split("|"))

# TODO: replace caches with limited-size caches that throw away data when they get too big
firstwords_cache = {}


def get_firstwords(first):
    """Return the (cached) set of second words that follow `first`.

    Best-effort: any lookup failure yields — and caches — an empty set.
    """
    if first not in firstwords_cache:
        try:
            firstwords_cache[first] = FirstWords.objects.get(
                firstword=first).secondwords_set()
        except Exception:
            # deliberate best-effort fallback; narrowed from a bare
            # except so SystemExit/KeyboardInterrupt still propagate.
            # NOTE(review): FirstWords is not imported in this chunk —
            # confirm it is imported elsewhere in the module.
            firstwords_cache[first] = set()
    return firstwords_cache[first]
def load_ranges(ranges):
    """Load and return the pickled SVM scaling ranges from disk.

    The `ranges` argument is unused; retained for caller compatibility.
    """
    # BUG FIX: mode "w" truncated the file before pickle.load could read
    # it; use read mode ("r" matches the text-mode "w" of save_ranges).
    return pickle.load(settings.localfile("data/svm_range.range", "r"))
def save_ranges(ranges):
    """Write the pickled SVM scaling ranges to the range data file."""
    dest = settings.localfile("data/svm_range.range", "w")
    pickle.dump(ranges, dest)
    dest.close()
from urlcheck.models import FirstWords, WordPair, WordTriple import urllib2 import settings from nlptools.html_to_text import html_to_text import re def tokenize(claim): return re.split("(\W+)",claim) def sentences(text): return text.split(".") firstwords_keys = set(settings.localfile("data/firstwords_keys.list").read().split("|")) #TODO: replace caches with limited-size caches that throw away data when they get too big firstwords_cache = {} def get_firstwords(first): if not first in firstwords_cache: try: firstwords_cache[first] = FirstWords.objects.get(firstword=first).secondwords_set() except: firstwords_cache[first] = set() return firstwords_cache[first] wordpair_cache = {} def get_wordpair(pair): if not pair in wordpair_cache: try: pairobj = WordPair.objects.get(pair=pair) wordpair_cache[pair] = (pairobj.triples_set(),pairobj.claims_list()) except: