Example #1
 def textvector(self, string, frequencyweighting=True, loglevel=False):
     uvector = sparsevectors.newemptyvector(self.dimensionality)
     if self.window > 0:
         windows = [string[ii:ii + self.window] for ii in range(len(string) - self.window + 1)]
         for sequence in windows:
             thisvector = self.makevector(sequence)
             if frequencyweighting:
                 factor = self.frequencyweight(sequence)
             else:
                 factor = 1
             logger(sequence + " " + str(factor), loglevel)
             if loglevel:
                 logger(str(sparsevectors.sparsecosine(uvector, sparsevectors.normalise(thisvector))), loglevel)
             uvector = sparsevectors.sparseadd(uvector, sparsevectors.normalise(thisvector), factor)
     else:
         words = nltk.word_tokenize(string)
         if self.binaryfrequencies:
             wordlist = set(words)  # not a list, a set but hey
         else:
             wordlist = words
         for w in wordlist:
             if frequencyweighting:
                 factor = self.frequencyweight(w)
             else:
                 factor = 1
             if w not in self.indexspace:
                 self.additem(w)
             else:
                 self.observe(w)
             uvector = sparsevectors.sparseadd(uvector, sparsevectors.normalise(self.indexspace[w]), factor)
     return uvector
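Note: every snippet on this page builds vectors by repeated calls to sparsevectors.sparseadd and sparsevectors.normalise. The sketch below illustrates the semantics these calls appear to assume (sparse vectors as index-to-value dicts, with the second argument scaled by an optional weight). It is an illustration of the usage pattern, not the actual sparsevectors implementation, and it omits the extra normalisation flag that a few call sites pass as a fourth argument.

import math

def sparseadd(u, v, weight=1):
    # Illustrative stand-in, not the library's code: return a new dict-based
    # sparse vector holding u + weight * v.
    result = dict(u)
    for index, value in v.items():
        result[index] = result.get(index, 0) + weight * value
    return result

def normalise(v):
    # Illustrative stand-in: scale v to unit Euclidean length; an all-zero
    # vector is returned unchanged.
    norm = math.sqrt(sum(value * value for value in v.values()))
    if norm == 0:
        return dict(v)
    return {index: value / norm for index, value in v.items()}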
def processfile(file):
    global sentencestorage, utterancespace
    sentenceindex = 0
    textvector = wordspace.newemptyvector()
    with open(file, "r", encoding="utf-8") as textfile:
        rawtext = textfile.read().lower()
        rawtext = re.sub('\n', ' ', rawtext)
        rawtext = re.sub('\"', ' ', rawtext)
        rawtext = re.sub(r'\s+', ' ', rawtext)
        sents = sent_tokenize(rawtext)
        for sentence in sents:
            sentenceindex += 1
            sentencestorage[sentenceindex] = sentence
            allsurfacewords = nltk.word_tokenize(sentence)
            wordspace.chkwordspace(allsurfacewords, debug)
            analyses = []
            try:
                analyses = semanticdependencyparse.semanticdepparse(
                    sentence.lower(), debug)
            except Exception:
                logger("PARSE ERROR " + str(sentenceindex) + "\t" + sentence,
                       error)
            kk = 0
            for analysis in analyses:
                words = analysis.values()
                wordspace.checkwordspacelist(words, debug)
                for role in analysis:
                    if role not in wordspace.permutationcollection:
                        wordspace.permutationcollection[
                            role] = sparsevectors.createpermutation(
                                wordspace.dimensionality)
                u = getvector(analysis, sentence)
                win = 1
                sentencesequence = 0  # discourse window size is 0 here, so the loop below never runs
                startindexforthistext = 0
                while win < sentencesequence:
                    if sentenceindex - win > startindexforthistext:
                        u = sparsevectors.sparseadd(
                            u,
                            sparsevectors.permute(
                                sparsevectors.normalise(
                                    utterancespace[sentenceindex - win]),
                                wordspace.permutationcollection["discourse"]))
                    win += 1
                if kk > 0:
                    sentenceindex += 1
                utterancespace[sentenceindex] = u
                textvector = sparsevectors.sparseadd(textvector, u, 1)
                kk += 1
        textspace[file] = textvector
    return textvector
Example #3
 def observecollocation(self, item, otheritem, operator="nil"):
     if not self.contains(item):
         self.additem(item)
     if not self.contains(otheritem):
         self.additem(otheritem)
     self.contextspace[item] = sparsevectors.sparseadd(
         self.contextspace[item],
         sparsevectors.normalise(self.indexspace[otheritem]))
Example #4
 def applyoperator(self, item, operator, constant, weight):
     self.contextspace[item] = sparsevectors.sparseadd(
         self.contextspace[item],
         sparsevectors.normalise(sparsevectors.permute(self.constantcollection[constant],
                                                       self.permutationcollection[operator])),
         weight)
     if operator == "morphology":
                 self.morphologyspace[item] = sparsevectors.sparseadd(
                     self.morphologyspace[item],
                     sparsevectors.normalise(sparsevectors.permute(self.constantcollection[constant],
                     self.permutationcollection[operator])),
                     weight)
     else:
         self.attributespace[item] = sparsevectors.sparseadd(
             self.attributespace[item],
             sparsevectors.normalise(sparsevectors.permute(self.constantcollection[constant],
                                                       self.permutationcollection[operator])),
             weight)
Example #5
 def addintoitem(self, item, vector, weight=1, operator=None):
     if not self.contains(item):
         self.additem(item)
     if operator is not None:
         vector = sparsevectors.permute(vector, operator)
     self.contextspace[item] = sparsevectors.sparseadd(self.contextspace[item],
                                                       sparsevectors.normalise(vector),
                                                       weight)
     self.changed = True
Example #6
 def addintoitem(self, item, vector, weight=1):
     if not self.contains(item):
         vector = sparsevectors.newrandomvector(self.dimensionality,
                                                self.denseness)
         self.indexspace[item] = vector
         self.globalfrequency[item] = 0
         self.contextspace[item] = sparsevectors.newemptyvector(
             self.dimensionality)
     self.contextspace[item] = \
         sparsevectors.sparseadd(self.contextspace[item], sparsevectors.normalise(vector), weight)
 def sequencevector(self, sequence, initialvector=None, loglevel=False):
     if initialvector is None:
         initialvector = sparsevectors.newemptyvector(self.dimensionality)
     windowlist = self.windows(sequence)
     logger(str(windowlist), loglevel)
     for w in windowlist:
         initialvector = sparsevectors.sparseadd(
             initialvector,
             sparsevectors.normalise(
                 self.onesequencevector(w, None, loglevel)))
     return initialvector
Example #8
 def postriplevector(self, text, poswindow=3):
     poses = nltk.pos_tag(text)
     windows = [poses[ii:ii + poswindow] for ii in range(len(poses) - poswindow + 1 + 2)]
     onevector = self.pospermutations["vector"]
     vector = sparsevectors.newemptyvector(self.dimensionality)
     for sequence in windows:
         for item in sequence:
             if item[1] not in self.pospermutations:
                 self.pospermutations[item[1]] = sparsevectors.createpermutation(self.dimensionality)
             onevector = sparsevectors.permute(onevector, self.pospermutations[item[1]])
         vector = sparsevectors.sparseadd(vector, onevector)
     return vector
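Example #8 and several of the other snippets bind items to roles or positions with sparsevectors.createpermutation and sparsevectors.permute. The sketch below shows the behaviour these calls seem to assume (a permutation as a shuffled list of coordinate indices, applied by remapping a sparse vector's nonzero coordinates); it is an assumption for illustration, not the library's implementation.

import random

def createpermutation(dimensionality):
    # Illustrative stand-in: a permutation as a shuffled list of coordinate indices.
    permutation = list(range(dimensionality))
    random.shuffle(permutation)
    return permutation

def permute(vector, permutation):
    # Illustrative stand-in: remap each nonzero coordinate through the permutation.
    return {permutation[index]: value for index, value in vector.items()}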
def processsentences(sents, testing=True):
    global sentencerepository, vectorrepositoryidx, featurerepository, index, ticker, sequencelabels, vectorrepositoryseq
    for s in sents:
        index += 1
        key = "s" + str(index)
        if s in sentencerepository.values():
            continue
        fs = featurise(s)
        logger(s, debug)
        fcxg = fs["features"]
        fpos = fs["pos"]
        fsem = fs["roles"]
        fwds = fs["words"]
        logger(fwds, debug)
        logger(fpos, debug)
        logger(fcxg, debug)
        logger(fsem, debug)
        vecidx = tokenvector(fwds, None, True, debug)
        vecseq = seq.sequencevector(fpos, None, debug)
        vecis = sparsevectors.sparseadd(vecidx, vecseq, 1, True)
        logger("idx - comb\t" + str(sparsevectors.sparsecosine(vecidx, vecis)),
               debug)
        logger("seq - comb\t" + str(sparsevectors.sparsecosine(vecseq, vecis)),
               debug)
        veccxg = tokenvector(fcxg, vecis, False, debug)
        logger("comb - cxg\t" + str(sparsevectors.sparsecosine(vecis, veccxg)),
               debug)
        logger("idx - cxg\t" + str(sparsevectors.sparsecosine(vecidx, veccxg)),
               debug)
        logger("seq - cxg\t" + str(sparsevectors.sparsecosine(veccxg, vecseq)),
               debug)
        vecsem = rolevector(fsem, veccxg, debug)
        logger("idx - sem\t" + str(sparsevectors.sparsecosine(vecidx, vecsem)),
               debug)
        logger("seq - sem\t" + str(sparsevectors.sparsecosine(vecseq, vecsem)),
               debug)
        logger("comb - sem\t" + str(sparsevectors.sparsecosine(vecis, vecsem)),
               debug)
        logger("cxg - sem\t" + str(sparsevectors.sparsecosine(veccxg, vecsem)),
               debug)
        sentencerepository[key] = s
        vectorrepositoryidx[key] = vecidx
        vectorrepositoryseq[key] = vecseq
        vectorrepositorycxg[key] = veccxg
        vectorrepositorysem[key] = vecsem
        featurerepository[key] = fs
        logger(str(key) + ":" + str(s) + "->" + str(fs), debug)
        if ticker > 1000:
            logger(str(index) + " sentences processed", monitor)
            squintinglinguist.restartCoreNlpClient()
            ticker = 0
        ticker += 1
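processsentences above compares the different sentence encodings pairwise with sparsevectors.sparsecosine. The sketch below shows the similarity it presumably computes over dict-based sparse vectors; this is an assumption for illustration, not the library's implementation.

import math

def sparsecosine(u, v):
    # Illustrative stand-in: cosine similarity between two dict-based sparse vectors.
    dot = sum(value * v.get(index, 0) for index, value in u.items())
    norm_u = math.sqrt(sum(value * value for value in u.values()))
    norm_v = math.sqrt(sum(value * value for value in v.values()))
    if norm_u == 0 or norm_v == 0:
        return 0.0
    return dot / (norm_u * norm_v)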
Example #10
def rolevector(roledict, initialvector=None, loglevel=False):
    if initialvector is None:
        initialvector = sparsevectors.newemptyvector(dimensionality)
    for role in roledict:
        for item in roledict[role]:
            ctxspace.observe(item, False, debug)
            tmp = initialvector
            initialvector = sparsevectors.sparseadd(
                initialvector,
                sparsevectors.normalise(
                    ctxspace.useoperator(ctxspace.indexspace[item], role)))
            if loglevel:
                logger(
                    role + " " + item + " " +
                    str(sparsevectors.sparsecosine(tmp, initialvector)),
                    loglevel)
    return initialvector
def tweetvector(string):
    uvector = sparsevectors.newemptyvector(ngramspace.dimensionality)
    if window > 0:
        windows = [
            string[ii:ii + window] for ii in range(len(string) - window + 1)
        ]
        for sequence in windows:
            if ngramspace.contains(sequence):
                thisvector = ngramspace.indexspace[sequence]
                # ngramspace.observe(sequence)  # should we be learning stuff now? naaw.
            else:
                thisvector = stringspace.makevector(sequence)
                # ngramspace.additem(sequence, thisvector)  # should it be added to cache? naaw.
            factor = ngramspace.frequencyweight(sequence)
            uvector = sparsevectors.sparseadd(
                uvector, sparsevectors.normalise(thisvector), factor)
    return uvector
Example #12
 def makevector(self, string):
     stringvector = {}  # np.array([0] * self.dimensionality)
     for character in string[::-1]:    # reverse the string! (to keep strings that share prefixes similar)
         if character not in self.indexspace:
             vec = {}
             nonzeros = random.sample(list(range(self.dimensionality)), self.denseness)
             random.shuffle(nonzeros)
             split = self.denseness // 2
             for i in nonzeros[:split]:
                 vec[i] = 1
             for i in nonzeros[split:]:
                 vec[i] = -1
             self.indexspace[character] = vec
             self.globalfrequency[character] = 1
             self.bign += 1
         stringvector = sparsevectors.sparseadd(sparsevectors.sparseshift(stringvector, self.dimensionality),
                                                self.indexspace[character])
         # np.append(stringvector[1:], stringvector[0]) + self.indexspace[character]
     return stringvector  # lil_matrix(stringvector.reshape(self.dimensionality, -1))
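Example #12 folds a string into a vector by repeatedly shifting the accumulated vector and adding the next character's index vector. The commented-out numpy line suggests that sparsevectors.sparseshift is a circular rotation of the coordinates; the sketch below is one plausible reading of that call, with the rotation direction and step size stated as assumptions.

def sparseshift(vector, dimensionality, steps=1):
    # Illustrative stand-in: rotate every nonzero coordinate down by `steps`
    # positions, wrapping around at `dimensionality` (mirroring
    # np.append(x[1:], x[0]), which moves each value one position down).
    return {(index - steps) % dimensionality: value for index, value in vector.items()}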
Example #13
def tokenvector(tokenlist, initialvector=None, weights=True, loglevel=False):
    if initialvector is None:
        initialvector = sparsevectors.newemptyvector(dimensionality)
    for item in tokenlist:
        if not weights or str(item).startswith("JiK"):
            # cxg features should not be weighted the same way lex feats are
            weight = 1
        else:
            weight = ctxspace.languagemodel.frequencyweight(item, True)
        ctxspace.observe(item, True)
        tmp = initialvector
        initialvector = sparsevectors.sparseadd(
            initialvector,
            sparsevectors.normalise(ctxspace.contextspace[item]), weight)
        if loglevel:
            logger(
                item + " " + str(weight) + " " +
                str(sparsevectors.sparsecosine(tmp, initialvector)), loglevel)
    return initialvector
Example #14
 def textvector(self,
                words,
                frequencyweighting=True,
                binaryfrequencies=False,
                loglevel=False):
     self.docs += 1
     uvector = sparsevectors.newemptyvector(self.dimensionality)
     if binaryfrequencies:
         wordlist = set(words)  # not a list, a set but hey
     else:
         wordlist = words
     for w in wordlist:
         if frequencyweighting:
             factor = self.frequencyweight(w)
         else:
             factor = 1
         if w not in self.indexspace:
             self.additem(w)
         else:
             self.observe(w)
         self.df[w] += 1
         uvector = sparsevectors.sparseadd(
             uvector, sparsevectors.normalise(self.indexspace[w]), factor)
     return uvector
def getvector(roleworddict, sentencestring):
    uvector = {}  # vector for test item
    for role in roleworddict:
        item = roleworddict[role]
        uvector = sparsevectors.sparseadd(
            uvector,
            sparsevectors.permute(
                sparsevectors.normalise(wordspace.indexspace[item]),
                wordspace.permutationcollection[role]),
            wordspace.frequencyweight(item))
    lexicalwindow = 1
    if lexicalwindow > 0:
        wds = word_tokenize(sentencestring.lower())
        windows = [
            wds[i:i + lexicalwindow]
            for i in range(len(wds) - lexicalwindow + 1)
        ]
        for sequence in windows:
            thisvector = {}
            for item in sequence:
                thisvector = sparsevectors.sparseadd(
                    sparsevectors.permute(
                        thisvector,
                        wordspace.permutationcollection["sequence"]),
                    wordspace.indexspace[item],
                    wordspace.frequencyweight(item))
            uvector = sparsevectors.sparseadd(
                uvector, sparsevectors.normalise(thisvector))
    pos = 1
    if pos > 0:
        wds = word_tokenize(sentencestring)
        posanalyses = nltk.pos_tag(wds)
        poslist = [i[1] for i in posanalyses]
        windows = [
            poslist[i:i + lexicalwindow]
            for i in range(len(poslist) - lexicalwindow + 1)
        ]
        for sequence in windows:
            thisvector = {}
            for item in sequence:
                thisvector = sparsevectors.sparseadd(
                    sparsevectors.permute(
                        thisvector,
                        wordspace.permutationcollection["sequence"]),
                    wordspace.indexspace[item],
                    wordspace.frequencyweight(item))
            uvector = sparsevectors.sparseadd(
                uvector, sparsevectors.normalise(thisvector))
    style = True
    if style:
        wds = word_tokenize(sentencestring)
        cpw = len(sentencestring) / len(wds)
        wps = len(wds)
        sl = True
        if sl:
            if wps > 8:
                uvector = sparsevectors.sparseadd(uvector, longsentencevector)
            if wps < 5:
                uvector = sparsevectors.sparseadd(uvector, shortsentencevector)
        posanalyses = nltk.pos_tag(wds)
        poslist = [i[1] for i in posanalyses]
        for poses in poslist:
            if poses == "RB" or poses == "RBR" or poses == "RBS":
                uvector = sparsevectors.sparseadd(uvector, adverbvector)
        for w in wds:
            if w in negationlist:
                uvector = sparsevectors.sparseadd(uvector, negationvector)
            if w in hedgelist:
                uvector = sparsevectors.sparseadd(uvector, hedgevector)
            if w in amplifierlist:
                uvector = sparsevectors.sparseadd(uvector, amplifiervector)

    # attitude terms
    # verb stats
    # seq newordgrams
    # verb classes use wordspace!
    # sent sequences
    return uvector
                print(probe, mc, n[mc], sentencerepository[mc])
        print(space.contexttoindexneighbourswithweights(probe))

for v in vectorrepository:
    print(v, sentencerepository[v], sep="\t", end="\t")
    #    print(v, vectorrepository[v])
    ww = nltk.word_tokenize(sentencerepository[v])
    vec = sparsevectors.newemptyvector(dimensionality)
    #    for www in ww:
    #        print(www, space.indexspace[www], space.globalfrequency[www], space.frequencyweight(www), sparsevectors.sparsecosine(space.indexspace[www], vectorrepository[v]))
    nvn = {}
    for www in ww:
        nvn[www] = sparsevectors.sparsecosine(space.indexspace[www],
                                              vectorrepository[v])
        vec = sparsevectors.sparseadd(
            vec, sparsevectors.normalise(space.indexspace[www]),
            space.frequencyweight(www))
    m = sorted(ww, key=lambda k: nvn[k], reverse=True)[:5]
    for mc in m:
        if nvn[mc] > 0.0001:
            print(mc, nvn[mc], sep=":", end="\t")
    print()

if False:
    for w in space.items():
        print(w, space.globalfrequency[w], space.indexspace[w], sep="\t")
        print("\t\t", space.contextspace[w])

# show that constructional items work the same way

# show that permuted semantic roles work "semantic grep"
            targetspace[textindex] = sparsevectors.newemptyvector(
                ngramspace.dimensionality)
            categorytable[textindex] = facittable[
                authornametable[authorindex]]  # name space collision for keys
        avector = tweetvector(b.text)
        thesevectors.append((targetlabel, avector))

    if len(thesevectors) > 0:
        random.shuffle(thesevectors)
        split = int(len(thesevectors) * testtrainfraction)
        testvectors[authorindex] = thesevectors[:split]
        testvectorantal += len(testvectors[authorindex])
        trainvectors[authorindex] = thesevectors[split:]
        trainvectorantal += len(trainvectors[authorindex])
        for tv in trainvectors[authorindex]:
            targetspace[tv[0]] = sparsevectors.sparseadd(
                targetspace[tv[0]], tv[1])
logger("Done training files.", monitor)

if outputmodel:
    # output character patterns to be able to generate new tweetvectors for separate testing on trained data
    stringspace.saveelementspace(charactervectorspacefilename)
    # output model here with info about the category of each model item
    with open(categorymodelfilename, "wb") as outfile:
        pickle.dump(targetspace, outfile)

logger(
    "Testing targetspace with " + str(len(targetspace)) + " categories, " +
    str(testvectorantal) + " test items and " + str(trainvectorantal) +
    " training cases. ", monitor)

confusion = ConfusionMatrix()
e = xml.etree.ElementTree.parse(textfile).getroot()
for b in e.iter("document"):
    textindex += 1
    tvector = sparsevectors.normalise(
        stringspace.textvector(b.text, frequencyweighting))
    textspace.additem(textindex, tvector)
    newtext = squintinglinguist.generalise(b.text)
    mvector = sparsevectors.normalise(
        stringspace.textvector(newtext, frequencyweighting))
    modifiedtextspace.additem(textindex, mvector)
    features = squintinglinguist.featurise(b.text)
    fvector = sparsevectors.newemptyvector(dimensionality)
    for feature in features:
        fv = stringspace.getvector(feature)
        fvector = sparsevectors.sparseadd(fvector, sparsevectors.normalise(fv),
                                          stringspace.frequencyweight(feature))
    fvector = sparsevectors.normalise(fvector)
    squintfeaturespace.additem(textindex, fvector)
    pvector = sparsevectors.normalise(stringspace.postriplevector(b.text))
    avector = sparsevectors.sparseadd(
        pvector,
        sparsevectors.sparseadd(mvector,
                                sparsevectors.sparseadd(fvector, tvector)))
    fullspace.additem(textindex, avector)
    textdepot[textindex] = b.text
    modifiedtextdepot[textindex] = newtext
    featuredepot[textindex] = features
logger("Done making " + str(textindex) + " vectors.", monitor)

matrix = False
if matrix:
    authorname = file.split(".")[0].split("/")[-1]
    authorindex += 1
    logger("Reading " + str(authorindex) + " " + file, monitor)
    workingvector = sparsevectors.newemptyvector(dimensionality)
    e = xml.etree.ElementTree.parse(file).getroot()

    for b in e.iter("document"):
        origtext = b.text
        avector = sparsevectors.newemptyvector(dimensionality)
        if fulltext:
            avector = sparsevectors.normalise(
                stringspace.textvector(origtext, frequencyweighting))
        if generalise:
            newtext = squintinglinguist.generalise(origtext)
            avector = sparsevectors.sparseadd(
                avector,
                sparsevectors.normalise(
                    stringspace.textvector(newtext, frequencyweighting)))
        if featurise:
            features = squintinglinguist.featurise(origtext)
            for feature in features:
                fv = stringspace.getvector(feature)
                avector = sparsevectors.sparseadd(
                    avector, sparsevectors.normalise(fv),
                    stringspace.frequencyweight(feature))
        if postriples:
            posttriplevector = stringspace.postriplevector(origtext)
            avector = sparsevectors.sparseadd(
                avector, sparsevectors.normalise(posttriplevector))
        workingvector = sparsevectors.sparseadd(
            workingvector, sparsevectors.normalise(avector))
    nn += 1
def doallthefiles(rangelimit=4000):
    filelist = {}
    seenfile = {}
    antal_frag = 0
    for ix in range(rangelimit):
        filelist[ix] = {}
        seenfile[ix] = True
        for cat in categories:
            fn = "{}{}.of_{:0>4d}.json.txt".format(path, cat, ix)
            try:
                os.stat(fn)
                if ix in filelist:
                    filelist[ix][cat] = fn
            except OSError:
                seenfile[ix] = None
                # drop the incomplete index; pop() avoids a KeyError when an
                # earlier missing category has already removed the entry
                filelist.pop(ix, None)
                logger(
                    "index {} did not match up {} file: {}".format(
                        ix, cat, fn), error)
    logger("antal filer: {}".format(len(filelist)), monitor)
    conditions = ["wp", "wd", "wn", "wdp", "wnp", "wnd", "wndp"]
    vocabulary = {}
    vocabulary_words = Counter()
    vocabulary_labels = Counter()
    vocabulary["wp"] = Counter()
    vocabulary["wd"] = Counter()
    vocabulary["wn"] = Counter()
    vocabulary["wnp"] = Counter()
    vocabulary["wnd"] = Counter()
    vocabulary["wdp"] = Counter()
    vocabulary["wndp"] = Counter()
    outfrag = {}
    for fileindex in filelist:
        if seenfile[fileindex]:
            zippy = mergefiles(filelist[fileindex][categories[0]],
                               filelist[fileindex][categories[1]],
                               filelist[fileindex][categories[2]],
                               filelist[fileindex][categories[3]])
            wp_f = open(
                '{}{}/new_{:0>4d}.txt'.format(outpath, "wp", fileindex), "w+")
            wd_f = open(
                '{}{}/new_{:0>4d}.txt'.format(outpath, "wd", fileindex), "w+")
            wn_f = open(
                '{}{}/new_{:0>4d}.txt'.format(outpath, "wn", fileindex), "w+")
            wnp_f = open(
                '{}{}/new_{:0>4d}.txt'.format(outpath, "wnp", fileindex), "w+")
            wnd_f = open(
                '{}{}/new_{:0>4d}.txt'.format(outpath, "wnd", fileindex), "w+")
            wdp_f = open(
                '{}{}/new_{:0>4d}.txt'.format(outpath, "wdp", fileindex), "w+")
            wndp_f = open(
                '{}{}/new_{:0>4d}.txt'.format(outpath, "wndp", fileindex),
                "w+")
            for fragment in zippy:
                antal_frag += 1
                for cc in conditions:
                    outfrag[cc] = []
                for oneitem in fragment:
                    vocabulary_words.update([oneitem[0]])
                    vocabulary_labels.update([oneitem[1]])
                    vocabulary_labels.update([oneitem[2]])
                    vocabulary_labels.update([oneitem[3]])
                    vocabulary["wp"].update(
                        [joinstring.join([oneitem[0], oneitem[1]])])
                    outfrag["wp"].append("".join([oneitem[0], oneitem[1]]))
                    vocabulary["wd"].update(
                        [joinstring.join([oneitem[0], oneitem[2]])])
                    outfrag["wd"].append("".join([oneitem[0], oneitem[2]]))
                    vocabulary["wn"].update(
                        [joinstring.join([oneitem[0], oneitem[3]])])
                    outfrag["wn"].append("".join([oneitem[0], oneitem[3]]))
                    vocabulary["wnp"].update([
                        joinstring.join([oneitem[0], oneitem[1], oneitem[2]])
                    ])
                    outfrag["wnp"].append("".join(
                        [oneitem[0], oneitem[1], oneitem[2]]))
                    vocabulary["wnd"].update([
                        joinstring.join([oneitem[0], oneitem[1], oneitem[3]])
                    ])
                    outfrag["wnd"].append("".join(
                        [oneitem[0], oneitem[1], oneitem[3]]))
                    vocabulary["wdp"].update([
                        joinstring.join([oneitem[0], oneitem[2], oneitem[3]])
                    ])
                    outfrag["wdp"].append("".join(
                        [oneitem[0], oneitem[2], oneitem[3]]))
                    vocabulary["wndp"].update([
                        joinstring.join(
                            [oneitem[0], oneitem[1], oneitem[2], oneitem[3]])
                    ])
                    outfrag["wndp"].append("".join(
                        [oneitem[0], oneitem[1], oneitem[2], oneitem[3]]))
                wp_f.write(" ".join(outfrag["wp"]) + "\n")
                wd_f.write(" ".join(outfrag["wd"]) + "\n")
                wn_f.write(" ".join(outfrag["wn"]) + "\n")
                wnp_f.write(" ".join(outfrag["wnp"]) + "\n")
                wnd_f.write(" ".join(outfrag["wnd"]) + "\n")
                wdp_f.write(" ".join(outfrag["wdp"]) + "\n")
                wndp_f.write(" ".join(outfrag["wndp"]) + "\n")
            wn_f.close()
            wd_f.close()
            wp_f.close()
            wnd_f.close()
            wnp_f.close()
            wdp_f.close()
            wndp_f.close()

    logger("antal fragment: {}".format(antal_frag), monitor)
    vocab_words = {w for w, c in vocabulary_words.items() if c >= MINCOUNT}
    size_vocab = len(vocab_words)
    logger("antal ord std: {}".format(size_vocab), monitor)
    embeddings = {}
    for w in vocab_words:
        embeddings[w] = sparsevectors.newrandomvector(dimensionality, density)

    vocab_labels = {w for w, c in vocabulary_labels.items() if c >= MINCOUNT}
    size_vocab = len(vocab_labels)
    logger("antal tag tot: {}".format(size_vocab), monitor)
    labelembeddings = {}
    for w in vocab_labels:
        try:
            labelembeddings[w] = sparsevectors.newrandomvector(
                dimensionality, labeldensity)
        except IndexError:
            logger("Indexerror: {}".format(w), error)
    for cc in conditions:
        vocab_words = {w for w, c in vocabulary[cc].items() if c >= MINCOUNT}
        size_vocab = len(vocab_words)
        compositeembeddings = {}
        logger("antal ord i {}: {}".format(cc, size_vocab), monitor)
        with open('{}{}/vocab.words.txt'.format(outpath, cc), "w+") as f:
            for wdl in sorted(list(vocab_words)):
                wd = "".join(wdl.split(joinstring))
                f.write('{}\n'.format(wd))
                vv = embeddings[wdl.split(joinstring)[0]]
                for ll in wdl.split(joinstring)[1:]:
                    vv = sparsevectors.sparseadd(vv, labelembeddings[ll])
                compositeembeddings[wd] = sparsevectors.listify(
                    sparsevectors.normalise(vv), dimensionality)
        with open('{}{}/compositevectors.txt'.format(outpath, cc), "w+") as f:
            for www in compositeembeddings:
                f.write("{} {}\n".format(
                    www, " ".join(map(str, compositeembeddings[www]))))