import hyperdimensionalsemanticspace
from nltk.tokenize import sent_tokenize, word_tokenize


def trainusingtext(text: str, window: int = 2):
    # weight() is the frequency-based weighting helper defined in the
    # configuration module below
    cspace = hyperdimensionalsemanticspace.SemanticSpace()
    cspace.addoperator("before")
    cspace.addoperator("after")
    dspace = hyperdimensionalsemanticspace.SemanticSpace()
    sentences = sent_tokenize(text.lower())
    for sentence in sentences:
        words = word_tokenize(sentence)
        for ii, word in enumerate(words):
            dspace.observe(word)
            dspace.addintoitem(word, sentence)
            # neighbours within the window on either side of the current
            # token; max() keeps the left slice from wrapping around to the
            # end of the list
            lhs = words[max(0, ii - window):ii]
            rhs = words[ii + 1:ii + window + 1]
            for lw in lhs:
                w = weight(lw)
                cspace.addintoitem(word, lw, w, "before")
            for rw in rhs:
                w = weight(rw)
                cspace.addintoitem(word, rw, w, "after")
    # cspace.outputwordspace("context.wordspace")
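# --- usage sketch, not part of the original file -----------------------------
# Assumes NLTK's punkt tokenizer data is installed and that weight() from the
# configuration module below is in scope. With window=2, each token is tied to
# up to two neighbours on either side, marked by the directional operators.
if __name__ == "__main__":
    trainusingtext("the cat sat on the mat . the dog chased the cat .")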
import re

import languagemodel
import hyperdimensionalsemanticspace
import simpletextfilereader
from nltk.tokenize import sent_tokenize

datadirectory = "/home/jussi/data/vectorspace/"
# ===========================================================================
debug = False
monitor = True
error = True
dimensionality = 2000
denseness = 10
ngramwindow = 3
# ===========================================================================
languagemodel = languagemodel.LanguageModel()  # the instance shadows the module from here on
languagemodel.importstats(datadirectory + "bgwordfrequency1.list")  # background word-frequency statistics
# ===========================================================================
# files = simpletextfilereader.getfilelist(datadirectory, re.compile(r".*09*.i*"))
cspace = hyperdimensionalsemanticspace.SemanticSpace(dimensionality, denseness)
cspace.addoperator("before")
cspace.addoperator("after")
dspace = hyperdimensionalsemanticspace.SemanticSpace(dimensionality, denseness)


def weight(item: str):
    # frequency-based weight from the background language model
    return languagemodel.frequencyweight(item, False)


def trainusingtext(text: str, window: int = 2):
    global ticker
    sentences = sent_tokenize(text.lower())  # list of sentence strings
    for sentence in sentences:
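# --- driver sketch, not part of the original file ----------------------------
# Ties the configuration above together, assuming (per the commented-out line
# above) that simpletextfilereader.getfilelist() yields file paths and (as in
# the polar-term script below) that doonejsontextfile() yields text strings.
def trainusingfiles(pattern=re.compile(r".*09*.i*")):
    for file in simpletextfilereader.getfilelist(datadirectory, pattern):
        for text in simpletextfilereader.doonejsontextfile(file):
            trainusingtext(text, window=2)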
# Compare polar terms with canonical terms in vector space:
# establish whether polar opposites correlate systematically with canonical opposites.
import hyperdimensionalsemanticspace
import simpletextfilereader
from nltk.tokenize import sent_tokenize

canonicalgood = ["good", "alive"]
canonicalbad = ["bad", "dead"]
canonicals = canonicalbad + canonicalgood
probegood = ["easy"]
probebad = ["difficult"]
probes = probebad + probegood
items = canonicals + probes
# for each polar term, build an utterance context vector
dimensionality = 2000
denseness = 10
contextspace = hyperdimensionalsemanticspace.SemanticSpace(dimensionality, denseness, "polarcanonical")
utterancespace = hyperdimensionalsemanticspace.SemanticSpace(dimensionality, denseness, "polarcanonical")
# simpletextfilereader.readstats()
window = 2
files = simpletextfilereader.getfilelist()
i = 0
antalsatser = 0  # sentence count ("antal satser" is Swedish for "number of sentences")
antalord = 0     # word count ("antal ord": "number of words")
threshold = 0.1
for file in files:
    i += 1
    texts = simpletextfilereader.doonejsontextfile(file)
    flag = []
    for text in texts:
        ss = sent_tokenize(text.lower())
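# --- evaluation sketch, not part of the original file ------------------------
# The stated goal is to test whether probe polarity tracks canonical polarity.
# similarity() is a hypothetical stand-in for whatever vector-comparison
# method the SemanticSpace implementation actually provides.
def polaritycheck(space):
    for probe in probes:
        goodscore = max(space.similarity(probe, c) for c in canonicalgood)
        badscore = max(space.similarity(probe, c) for c in canonicalbad)
        print(probe, "leans", "good" if goodscore > badscore else "bad")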
import hyperdimensionalsemanticspace
import sparsevectors

number = 10
negattitudewordset = set()
posattitudewordset = set()
vecs = hyperdimensionalsemanticspace.SemanticSpace()
wordspacedirectory = "/home/jussi/data/wordspaces/"
wordspacefile = "canonical.space.2017-09-05.EN.twitter.jq.irma"
apfile = "canonical.space.2.ap"
vecs.inputwordspace(wordspacedirectory + wordspacefile)

# load the Bing Liu polarity lexica, one word per line
with open("/home/jussi/data/poles/en/enposBingLiu.list", "r") as posfile:
    for line in posfile:
        posattitudewordset.add(line.rstrip())
with open("/home/jussi/data/poles/en/ennegBingLiu.list", "r") as negfile:
    for line in negfile:
        negattitudewordset.add(line.rstrip())
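# --- sanity-check sketch, not part of the original file ----------------------
# The two polarity lists are expected to be (near-)disjoint; printing the
# counts and any overlap is a cheap way to verify they loaded correctly.
overlap = posattitudewordset & negattitudewordset
print(len(posattitudewordset), "positive words,",
      len(negattitudewordset), "negative words,",
      len(overlap), "words in both lists")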
import re

import sparsevectors
import semanticdependencyparse
import hyperdimensionalsemanticspace

# Take a file with one utterance per line and process the utterances one by one:
# - parse each utterance
# - generate lexical vectors
# - never mind context vectors for now
# Return a vector per utterance / line.

sentencestorage = {}
utterancespace = {}
textspace = {}
wordspace = hyperdimensionalsemanticspace.SemanticSpace()
debug = False
monitor = True
error = True


def processfile(file):
    global sentencestorage, utterancespace
    sentenceindex = 0
    textvector = wordspace.newemptyvector()
    with open(file, "r", encoding="utf-8") as textfile:
        # normalise the raw text: lowercase, strip newlines and quotes,
        # collapse runs of whitespace
        rawtext = textfile.read().lower()
        rawtext = re.sub(r'\n', ' ', rawtext)
        rawtext = re.sub(r'\"', ' ', rawtext)
        rawtext = re.sub(r'\s+', ' ', rawtext)
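# --- normalisation demo, not part of the original file -----------------------
# Shows the effect of the three substitutions above on a small sample.
demo = 'a  "quoted"\nline'
demo = re.sub(r'\s+', ' ', re.sub(r'\"', ' ', re.sub(r'\n', ' ', demo.lower())))
assert demo == 'a quoted line'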
import hyperdimensionalsemanticspace
import stringsequencespace

vectors = {}
canonicalwordset = set()
attributewordset = set()
amplifierGwordset = set()
amplifierSwordset = set()
amplifierTwordset = set()
pragmaticswordset = set()
negattitudewordset = set()
posattitudewordset = set()
downtonerswordset = set()
negationwordset = set()
dim = 2000
den = 10
win = 0
space = hyperdimensionalsemanticspace.SemanticSpace(dim, den)
strings = stringsequencespace.StringSequenceSpace(dim, den, win)
testbatchsize = 10000
batch = 500
thresholdofinterest = 5
negationskipwindow = 4
amplifierdowntonerwindow = 4
datadirectory = "/home/jussi/data/storm/fixed/"
outputdirectory = "/home/jussi/data/wordspaces/"
resourcedirectory = "/home/jussi/data/poles/en/"


def redovisa(n, file="canonical.space"):  # "redovisa" is Swedish for "report"
    print(n)
    for cw in canonicalwordset:
        try:
import hyperdimensionalsemanticspace

debug = True


def nop(dummy):
    # do nothing
    return None


outputfile = "/home/jussi/aktuellt/1.case/tammikuu/finnish-cases-output.txt"
dimensionality = 2000
denseness = 10
# tokens x words, context 2x2
tokencontextspace = hyperdimensionalsemanticspace.SemanticSpace(
    dimensionality, denseness, "token vs wds, 2x2")
# tokens x words, context sentence
tokenutterancespace = hyperdimensionalsemanticspace.SemanticSpace(
    dimensionality, denseness, "token vs wds, utt")
# all tokens x words, context 2x2
fullcontextspace = hyperdimensionalsemanticspace.SemanticSpace(
    dimensionality, denseness, "all token vs wds, 2x2")
# all tokens x words, context sentence
fullutterancespace = hyperdimensionalsemanticspace.SemanticSpace(
    dimensionality, denseness, "all token vs wds, utt")
# lemmas x cases, context one token, per entire corpus
lemmacasespace = hyperdimensionalsemanticspace.SemanticSpace(
    dimensionality, denseness, "lemma x case")
# lemmas x cases, per text
# lemmacasetextspace = hyperdimensionalsemanticspace.SemanticSpace(dimensionality, denseness)
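# --- population sketch, not part of the original file ------------------------
# Illustrates, with the two-argument addintoitem() form used elsewhere in the
# repository, how one analysed token might update these spaces; the Finnish
# token, lemma, and case values are hypothetical examples.
token, lemma, case = "talossa", "talo", "inessive"
utterance = "asun isossa talossa"
tokenutterancespace.addintoitem(token, utterance)
lemmacasespace.addintoitem(lemma, case)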