Example #1
import hyperdimensionalsemanticspace
from nltk.tokenize import sent_tokenize, word_tokenize


def trainusingtext(text: str, window: int = 2):
    # weight() is the frequency-based weighting function defined in Example #2
    cspace = hyperdimensionalsemanticspace.SemanticSpace()
    cspace.addoperator("before")
    cspace.addoperator("after")
    dspace = hyperdimensionalsemanticspace.SemanticSpace()
    sentences = sent_tokenize(text.lower())
    for sentence in sentences:
        words = word_tokenize(sentence)
        for ii, word in enumerate(words):
            dspace.observe(word)
            dspace.addintoitem(word, sentence)
            # up to `window` words before and after the target word;
            # max() keeps the left slice from wrapping around at the start
            lhs = words[max(0, ii - window):ii]
            rhs = words[ii + 1:ii + window + 1]
            for lw in lhs:
                w = weight(lw)
                cspace.addintoitem(word, lw, w, "before")
            for rw in rhs:
                w = weight(rw)
                cspace.addintoitem(word, rw, w, "after")


#cspace.outputwordspace("context.wordspace")
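To see what the two slices produce, here is a minimal self-contained sketch of the before/after windows over a toy tokenized sentence (plain Python, no repo dependencies):

words = ["the", "cat", "sat", "on", "the", "mat"]
window = 2
for ii, word in enumerate(words):
    lhs = words[max(0, ii - window):ii]   # up to `window` words before the target
    rhs = words[ii + 1:ii + window + 1]   # up to `window` words after the target
    print(word, "before:", lhs, "after:", rhs)
# e.g. for "sat": before: ['the', 'cat']  after: ['on', 'the']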
Example #2
import hyperdimensionalsemanticspace
import languagemodel
from nltk.tokenize import sent_tokenize, word_tokenize

datadirectory = "/home/jussi/data/vectorspace/"
# ===========================================================================
debug = False
monitor = True
error = True
dimensionality = 2000
denseness = 10
ngramwindow = 3
# ===========================================================================
# note: the LanguageModel instance rebinds (shadows) the module name from here on
languagemodel = languagemodel.LanguageModel()
languagemodel.importstats(datadirectory +
                          "bgwordfrequency1.list")  # insert file name here
# ===========================================================================
# files = simpletextfilereader.getfilelist(datadirectory, re.compile(r".*09*.i*"))

cspace = hyperdimensionalsemanticspace.SemanticSpace(dimensionality, denseness)

cspace.addoperator("before")
cspace.addoperator("after")

dspace = hyperdimensionalsemanticspace.SemanticSpace(dimensionality, denseness)


def weight(item: str):
    return languagemodel.frequencyweight(item, False)


def trainusingtext(text: str, window: int = 2):
    global ticker
    sentences = sent_tokenize(text.lower())  # list of sentence strings
    # the loop body is the same windowed-context update as in Example #1
    for sentence in sentences:
        words = word_tokenize(sentence)
        for ii, word in enumerate(words):
            dspace.observe(word)
            dspace.addintoitem(word, sentence)
            for lw in words[max(0, ii - window):ii]:
                cspace.addintoitem(word, lw, weight(lw), "before")
            for rw in words[ii + 1:ii + window + 1]:
                cspace.addintoitem(word, rw, weight(rw), "after")
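The repo's frequencyweight implementation is not shown in these snippets; a common choice for damping high-frequency words is an inverse-log scheme. A minimal self-contained sketch, assuming a plain {word: count} frequency table (the names freqs and frequencyweight below are illustrative, not the repo's API):

import math

freqs = {"the": 120000, "cat": 300, "sat": 150}  # toy background frequencies

def frequencyweight(item: str) -> float:
    # unseen words get full weight; frequent words are damped toward 0
    count = freqs.get(item, 0)
    return 1.0 / (1.0 + math.log(1 + count))

print(frequencyweight("the"))   # small weight for a very frequent word
print(frequencyweight("cat"))   # larger weight for a rarer word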
Example #3
import hyperdimensionalsemanticspace
import simpletextfilereader
from nltk.tokenize import sent_tokenize

# compare polar terms with canonical terms in vector space
# establish whether polar opposites correlate systematically with canonical opposites

canonicalgood = ["good", "alive"]
canonicalbad = ["bad", "dead"]
canonicals = canonicalbad + canonicalgood
probegood = ["easy"]
probebad = ["difficult"]
probes = probebad + probegood
items = canonicals + probes

# for each polar term, build an utterance context vector

dimensionality = 2000
denseness = 10
contextspace = hyperdimensionalsemanticspace.SemanticSpace(dimensionality, denseness, "polarcanonical")
utterancespace = hyperdimensionalsemanticspace.SemanticSpace(dimensionality, denseness, "polarcanonical")

#simpletextfilereader.readstats()
window = 2
files = simpletextfilereader.getfilelist()
i = 0
antalsatser = 0  # sentence count ("antal satser" is Swedish for "number of sentences")
antalord = 0  # word count ("antal ord" is Swedish for "number of words")
threshold = 0.1
for file in files:
    i += 1
    texts = simpletextfilereader.doonejsontextfile(file)
    flag = []
    for text in texts:
        ss = sent_tokenize(text.lower())
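The snippet breaks off before the actual comparison. As a minimal self-contained sketch of the intended check, cosine similarity between probe and canonical vectors can be computed like this (the toy dense vectors stand in for the repo's SemanticSpace vectors, which is an assumption on my part):

import math

def cosine(u, v):
    dot = sum(a * b for a, b in zip(u, v))
    norm = math.sqrt(sum(a * a for a in u)) * math.sqrt(sum(b * b for b in v))
    return dot / norm if norm else 0.0

# toy context vectors; in the real experiment these would come from contextspace
toyvectors = {
    "good": [1.0, 0.2, 0.1], "bad": [0.1, 1.0, 0.2],
    "easy": [0.9, 0.3, 0.2], "difficult": [0.2, 0.9, 0.1],
}
for probe in ["easy", "difficult"]:
    for canonical in ["good", "bad"]:
        print(probe, canonical, round(cosine(toyvectors[probe], toyvectors[canonical]), 3))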
Example #4
import hyperdimensionalsemanticspace
import sparsevectors

number = 10
negattitudewordset = set()
posattitudewordset = set()


vecs = hyperdimensionalsemanticspace.SemanticSpace()
wordspacedirectory = "/home/jussi/data/wordspaces/"
wordspacefile = "canonical.space.2017-09-05.EN.twitter.jq.irma"
apfile = "canonical.space.2.ap"
vecs.inputwordspace(wordspacedirectory + wordspacefile)

with open("/home/jussi/data/poles/en/enposBingLiu.list", "r") as posfile:
    line = posfile.readline()
    lineno = 0
    while line:
        lineno += 1
        word = line.rstrip()
        posattitudewordset.add(word)
        line = posfile.readline()

with open("/home/jussi/data/poles/en/ennegBingLiu.list", "r") as negfile:
    line = negfile.readline()
    lineno = 0
    while line:
        lineno += 1
        word = line.rstrip()
        negattitudewordset.add(word)
        line = negfile.readline()
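A typical use of the two word sets just loaded is a crude polarity score for a tokenized utterance; a minimal sketch (the scoring scheme is my illustration, not taken from the repo):

def attitudescore(words):
    # positive hits minus negative hits, normalised by utterance length
    pos = sum(1 for w in words if w in posattitudewordset)
    neg = sum(1 for w in words if w in negattitudewordset)
    return (pos - neg) / len(words) if words else 0.0

print(attitudescore("this was a great and pleasant surprise".split()))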
Example #5
import re
import sparsevectors
import semanticdependencyparse
import hyperdimensionalsemanticspace

# take a file with an utterance per line and process it line by line:
# - parse it
# - generate lexical vectors
# - never mind context vectors for now
# return a vector per utterance / line

sentencestorage = {}
utterancespace = {}
textspace = {}
wordspace = hyperdimensionalsemanticspace.SemanticSpace()

debug = False
monitor = True
error = True


def processfile(file):
    global sentencestorage, utterancespace
    sentenceindex = 0
    textvector = wordspace.newemptyvector()
    with open(file, "r", encoding="utf-8") as textfile:
        rawtext = textfile.read().lower()
        # collapse newlines, straight quotes, and runs of whitespace to single spaces
        rawtext = re.sub(r'\n', ' ', rawtext)
        rawtext = re.sub(r'\"', ' ', rawtext)
        rawtext = re.sub(r'\s+', ' ', rawtext)
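The three substitutions collapse the raw text into a single whitespace-normalised line; for instance:

import re

raw = 'He said:\n"wait here"\n\n  then left'.lower()
raw = re.sub(r'\n', ' ', raw)
raw = re.sub(r'\"', ' ', raw)
raw = re.sub(r'\s+', ' ', raw)
print(raw)  # -> 'he said: wait here then left'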
Example #6
import hyperdimensionalsemanticspace
import stringsequencespace

vectors = {}

canonicalwordset = set()
attributewordset = set()
amplifierGwordset = set()
amplifierSwordset = set()
amplifierTwordset = set()
pragmaticswordset = set()
negattitudewordset = set()
posattitudewordset = set()
downtonerswordset = set()
negationwordset = set()
dim = 2000
den = 10
win = 0
space = hyperdimensionalsemanticspace.SemanticSpace(dim, den)
strings = stringsequencespace.StringSequenceSpace(dim, den, win)
testbatchsize = 10000
batch = 500
thresholdofinterest = 5
negationskipwindow = 4
amplifierdowntonerwindow = 4
datadirectory = "/home/jussi/data/storm/fixed/"
outputdirectory = "/home/jussi/data/wordspaces/"
resourcedirectory = "/home/jussi/data/poles/en/"


def redovisa(n, file="canonical.space"):
    # "redovisa" is Swedish for "report": print progress for the canonical words
    print(n)
    for cw in canonicalwordset:
        try:
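The negationskipwindow and amplifierdowntonerwindow settings above suggest that token polarity is flipped or rescaled for a few tokens following a negation or amplifier. A minimal sketch of the negation case, assuming that reading of the parameter name (the word list and scoring are illustrative, not the repo's code):

negationwords = {"not", "never", "no"}
negationskipwindow = 4

def polarities(words):
    # flip token polarity for up to `negationskipwindow` tokens after a negation
    flipped = 0
    out = []
    for w in words:
        if w in negationwords:
            flipped = negationskipwindow
            continue
        out.append((w, -1 if flipped > 0 else 1))
        flipped = max(0, flipped - 1)
    return out

print(polarities("this is not a good idea at all".split()))
# -> [('this', 1), ('is', 1), ('a', -1), ('good', -1), ('idea', -1), ('at', -1), ('all', 1)]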
Example #7
import hyperdimensionalsemanticspace

debug = True


def nop(dummy):
    # do nothing
    return None


outputfile = "/home/jussi/aktuellt/1.case/tammikuu/finnish-cases-output.txt"

dimensionality = 2000
denseness = 10

# tokens x words context 2x2
tokencontextspace = hyperdimensionalsemanticspace.SemanticSpace(
    dimensionality, denseness, "token vs wds, 2x2")
# tokens x words context sentence
tokenutterancespace = hyperdimensionalsemanticspace.SemanticSpace(
    dimensionality, denseness, "token vs wds, utt")
# all tokens x words, context 2x2
fullcontextspace = hyperdimensionalsemanticspace.SemanticSpace(
    dimensionality, denseness, "all token vs wds, 2x2")
# all tokens x words, context sentence
fullutterancespace = hyperdimensionalsemanticspace.SemanticSpace(
    dimensionality, denseness, "all token vs wds, utt")
# lemmas x cases, context: one token, over the entire corpus
lemmacasespace = hyperdimensionalsemanticspace.SemanticSpace(
    dimensionality, denseness, "lemma x case")
# lemmas x cases per text
# lemmacasetextspace = hyperdimensionalsemanticspace.SemanticSpace(dimensionality,denseness)
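As a simplified stand-in for what lemmacasespace accumulates, lemma-by-case co-occurrence can be kept in a plain nested counter; a minimal sketch (the (lemma, case) pairs below are invented Finnish examples, and the real code accumulates SemanticSpace vectors rather than counts):

from collections import Counter, defaultdict

lemmacasecounts = defaultdict(Counter)

# (lemma, case) observations as they might come out of a morphological analyser
observations = [("talo", "Ine"), ("talo", "Ela"), ("talo", "Ine"), ("katu", "All")]
for lemma, case in observations:
    lemmacasecounts[lemma][case] += 1

print(lemmacasecounts["talo"])  # -> Counter({'Ine': 2, 'Ela': 1})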