Esempio n. 1
0
def vk2xmpp(id):
	"""
	Maps between VK numeric ids and XMPP JIDs.

	Returns id@TransportID if parameter "id" is int or str(int)
	Returns id if parameter "id" is id@TransportID
	Returns TransportID if "id" is TransportID
	"""
	if utils.isNumber(id) or "@" not in id:
		# bare id: qualify with the transport domain unless it IS the domain
		if id != TransportID:
			id = u"%s@%s" % (id, TransportID)
		return id
	# JID form: keep only the local part, normalizing numeric ids to int
	local = id.split("@")[0]
	return int(local) if utils.isNumber(local) else local
Esempio n. 2
0
    def __isCommand(self, key, word2):
        """Return True when `key word2` looks like 'command arg, ...'
        that should be rewritten to 'command(arg, ...)' to support
        'command syntax'.
        """
        # reject keys that cannot possibly start a command
        badKey = (not isValidName(key) or
                  key in self.friends or
                  key.startswith('#') or
                  len(key) < 1 or len(word2) < 1)
        if badKey:
            return False

        # with a live larch session, the key must be a registered command
        if self._larch is not None:
            comms = self._larch.symtable.get_symbol('_sys.valid_commands',
                                                    create=True)
            if key not in comms:
                return False

        # the argument must itself be a name, a number or a literal string
        return (isValidName(word2) or isNumber(word2) or
                isLiteralStr(word2))
Esempio n. 3
0
	def classify(self, data):
		"""Classify a data row with naive Bayes.

		Multiplies the per-feature conditional probabilities (categorical
		features use the learned probability table, numeric features a
		gaussian density) with the class prior, then returns the class
		with the highest posterior product (None if nothing beats 0).
		"""
		classProbabilities = {}  # Stores our final probabilities
		for className in self.classifierBins:
			probabilityProd = None
			# Calculate product of our probabilities
			for key in data:
				if key == settings.CLASSIFIER_NAME:
					continue  # the label column is not a feature
				# idiom fix: `not util.isNumber(...)` instead of `== False`
				if not util.isNumber(data[key]):
					# categorical: learned conditional probability
					prob = self.probability[str(data[key]) + " given " + className]
				else:
					# numeric: gaussian density from learned mean/stdev
					prob = util.gaussianDensity(
						data[key],
						self.numericBins[key + ' given ' + className + ' mean'],
						self.numericBins[key + ' given ' + className + ' stdev'])
				# idiom fix: `is None` instead of `== None`; single
				# accumulation site instead of two duplicated if/else blocks
				probabilityProd = prob if probabilityProd is None else probabilityProd * prob
			classProbabilities[className] = probabilityProd * self.probability[className]
		# argmax over the posteriors
		maxProb = [0, None]
		for className in classProbabilities:
			if classProbabilities[className] > maxProb[0]:
				maxProb = [classProbabilities[className], className]
		return maxProb[1]
Esempio n. 4
0
 def classify(self, data):
     """Classify one data row with naive Bayes.

     Combines categorical probabilities and gaussian densities for
     numeric features into a per-class product, multiplies by the class
     prior and returns the best-scoring class name (None when no class
     scores above zero).
     """
     classProbabilities = {}  # Stores our final probabilities
     for className in self.classifierBins:
         probabilityProd = None
         # Calculate product of our probabilities
         for key in data:
             if key == settings.CLASSIFIER_NAME:
                 continue  # skip the label column itself
             # idiom fix: use `not util.isNumber(...)` rather than `== False`
             if not util.isNumber(data[key]):
                 # categorical feature: learned conditional probability
                 prob = self.probability[str(data[key]) + " given " + className]
             else:
                 # numeric feature: gaussian density from learned mean/stdev
                 prob = util.gaussianDensity(
                     data[key], self.numericBins[key + ' given ' +
                                                 className + ' mean'],
                     self.numericBins[key + ' given ' + className +
                                      ' stdev'])
             # idiom fix: `is None` instead of `== None`, single
             # accumulation expression instead of duplicated branches
             probabilityProd = prob if probabilityProd is None else probabilityProd * prob
         classProbabilities[
             className] = probabilityProd * self.probability[className]
     # pick the class with the highest posterior
     maxProb = [0, None]
     for className in classProbabilities:
         if classProbabilities[className] > maxProb[0]:
             maxProb = [classProbabilities[className], className]
     return maxProb[1]
Esempio n. 5
0
def filter_keyword():
    """Collect documents mentioning u'国家战略' whose gbk-decoded path also
    mentions u'习近平', tokenize them with jieba and append the cleaned
    text to the module-level ``text_file``.

    NOTE(review): Python 2 only (``print`` statement, ``str.decode``).
    """
    # index file: one source-document path per line
    fname = u'D:/Data/词检索/国家战略.txt'
    with open(fname, 'r') as f:
        files = [line.strip() for line in f.readlines()]
    for f in files:
        with open(f, 'r') as fp:
            text = fp.readlines()
        if f.startswith('D:/Data/gov'):
            # gov documents are stored as utf-8; decode before searching
            text = ' '.join(text).decode('utf-8')
            # f = f.lstrip('D:/Data/gov/')
            # dep, fname = f.split('/')[0], f.split('/')[1]
            # f = '%s_%s_%s'%(fname.split('_')[0], dep, fname.split('_')[1])
            # print f
        # NOTE(review): when f does not start with 'D:/Data/gov', text is
        # still a list here and .find() will raise AttributeError -- confirm
        # all listed files live under that prefix
        if text.find(u'国家战略') != -1 and f.decode('gbk').find(u'习近平') != -1:
            print f
            # fname = u'D:/Data/词检索/国家战略/%s'%(f.decode('gbk'))
            # with open(fname, 'w') as fp:
            # fp.write(text.encode('utf-8'))
            # tokenize, dropping stop words and purely numeric tokens
            doc = jieba.cut(text)
            doc = [
                word for word in doc
                if (not word in stop_words) and (not utils.isNumber(word))
            ]
            doc = ' '.join(doc)
            # flatten whitespace so each document stays on a single line
            doc = doc.replace('\t', ' ').replace('\n', ' ')
            with open(text_file, 'a') as fp:
                fp.write(doc.encode('utf-8') + '\n')
Esempio n. 6
0
def get_topic_heat(topic_number, look_back=7):
    """Aggregate daily counts ("heat") of a topic's top words and save
    absolute/relative rolling sums to an Excel file.

    :param topic_number: topic id; word list is read from
        ``<TOPIC_DIR>/topic_words/topic_<n>_twords.xlsx``
    :param look_back: rolling-window length in days
    """
    fname = '%s/topic_words/topic_%s_twords.xlsx' % (const.TOPIC_DIR,
                                                     topic_number)
    df = pd.read_excel(fname)
    # sum per-day occurrences over every (non-numeric) topic word
    # (fix: has_key/iteritems are Python-2-only; use `in` / items())
    heat_count = {}
    for word in df['word']:
        if not utils.isNumber(word) and word in word_count:
            for day, value in word_count[word].items():
                if day not in heat_count:
                    heat_count[day] = 0
                heat_count[day] += value
    # daily totals over the whole corpus, indexed and sorted by date
    df = pd.DataFrame({
        'date': list(total_word_count.keys()),
        'count': list(total_word_count.values())
    })
    df.index = pd.to_datetime(df['date'], format="%Y-%m-%d")
    df.sort_index(inplace=True)
    heat_df = pd.DataFrame({
        "date": list(heat_count.keys()),
        "absolute": list(heat_count.values())
    })
    heat_df.index = heat_df["date"].map(
        lambda x: datetime.datetime.strptime(x, "%Y-%m-%d"))
    heat_df.sort_index(inplace=True)
    heat_df['total'] = df['count']
    # rolling look_back-day sums
    heat_df.loc[:, 'total'] = heat_df['total'].rolling(window=look_back).sum()
    heat_df.loc[:, 'absolute'] = heat_df['absolute'].rolling(
        window=look_back).sum()
    heat_df['relative'] = heat_df['absolute'] * 100. / heat_df['total']
    heat_df.to_excel("%s/%s.xlsx" % (const.TOPIC_CLASS_DIR, topic_number),
                     index=False)
Esempio n. 7
0
def update_files(prefix, files):
    """Update the persisted word -> {date: count} mapping with new files.

    Only .txt files dated 2016/2017 that are not already listed in the
    checked-files list are processed; each processed path is appended to
    that list.

    :param prefix: directory prefix joined onto each file name
    :param files: iterable of file names in "<date>_..." format
    """
    with open(const.WORD_CNT_FILE, 'rb') as fp:
        word_count = pickle.load(fp)
    with open(const.WORD_CNT_CHECKED_FILE, 'r') as fp:
        checked_list = set([line.strip().decode('utf-8') for line in fp.readlines()])
    f_check = open(const.WORD_CNT_CHECKED_FILE, 'a')
    try:
        for f in files:
            date = f.split('_')[0]
            if not date.startswith('2016') and not date.startswith('2017'):
                continue
            date = date.split(' ')[0]
            f = '%s/%s' % (prefix, f)
            if f in checked_list or not f.endswith('.txt'):
                continue
            f_check.write(f.encode('utf-8') + '\n')
            with open(f, 'r') as fp:
                text = fp.readlines()

            # tokenize the document and bump per-word, per-day counters
            content = ' '.join(text)
            doc = [word for word in jieba.cut(content) if not word in stop_words]
            for word in doc:
                if utils.isNumber(word):
                    continue
                # BUG FIX: the original did `if word_count.has_key(word):
                # continue`, so any word already present never accumulated
                # counts again -- count every non-numeric occurrence (this
                # matches the sibling update_wallst_word_count logic)
                if word not in word_count:
                    word_count[word] = {}
                if date not in word_count[word]:
                    word_count[word][date] = 0
                word_count[word][date] += 1
    finally:
        # the original leaked this handle
        f_check.close()
    with open(const.WORD_CNT_FILE, 'wb') as fp:
        pickle.dump(word_count, fp)
Esempio n. 8
0
def get_wallst_text():
    """Build one big tokenized document from all wallstreetcn articles.

    For every file under WALLSTCN_DIR/<year>/ the first line holds a
    timestamp; the body is tokenized with jieba, stop words and numeric
    tokens are dropped, and everything is written to TEXT_FILE.

    NOTE(review): Python 2 only (print statement, str.encode on write).
    """
    document = ''
    for y in years:
        files = [
            "%s/%s/%s" % (WALLSTCN_DIR, y, f)
            for f in os.listdir("%s/%s/" % (WALLSTCN_DIR, y))
        ]
        print y, len(files)
        if len(files) > 0:
            for f in files:
                with open(f, 'r') as fp:
                    text = fp.readlines()
                # first line is "<timestamp>_..." -- two date formats occur
                time = text[0].split('_')[0]
                time = time.strip()
                if time.find('-') != -1:
                    dt = datetime.datetime.strptime(time, "%Y-%m-%d %H:%M")
                else:
                    dt = datetime.datetime.strptime(time, '%Y年%m月%d日 %H:%M:%S')
                # NOTE(review): date is computed but never used below
                date = dt.strftime("%Y-%m-%d")

                # tokenize the article body, dropping stop words and numbers
                content = " ".join(text[1:])
                doc = [
                    word for word in jieba.cut(content)
                    if (not word in stop_words) and (not utils.isNumber(word))
                ]
                document += '\n' + ' '.join(doc)

    with open(TEXT_FILE, 'w') as fp:
        fp.write(document.encode('utf-8'))
Esempio n. 9
0
    def writeMember(self, obj, memberName):
        """Write a single member of *obj* into the HDF hierarchy.

        The member is obj[memberName] for a list with an integer index,
        otherwise getattr(obj, memberName).  Objects implementing
        hdfWrite are descended into recursively, lists become datasets,
        strings and numbers become attributes; anything else is skipped
        with a log message.
        """
        if isString(obj):
            logging.warning(u"String as object provided! " +
                            self._warningPrefix(obj, memberName))
        # fetch the member: by index for lists, by attribute otherwise
        if isInteger(memberName) and isList(obj):
            value = obj[memberName]
            memberName = str(memberName)
        else:
            value = getattr(obj, memberName, None)
        if value is None:
            self.log(u"skipped " + self._warningPrefix(obj, memberName) +
                     u"It is empty or does not exist (=None).")
            return

        # call factories/getters unless they handle HDF writing themselves
        if isCallable(value) and not hasattr(value, "hdfWrite"):
            value = value()

        if hasattr(value, "hdfWrite"):  # support instances and types
            # descend: give the member a group of its own, then restore
            prevLocation = self.location
            self._location = "/".join((prevLocation.rstrip('/'), memberName))
            value.hdfWrite(self)  # recursion entry, mind the loops!
            self._location = prevLocation
        elif isList(value):
            self.writeDataset(memberName, value)
        elif isString(value) or isNumber(value):
            self.writeAttribute(memberName, value)
        else:
            self.log(u"skipped " + self._warningPrefix(obj, memberName) +
                     "(={}) It is not a compatible value type!".format(
                         classname(value)))
Esempio n. 10
0
 def removeTask(self, task):
     '''Remove a task (given by numeric ID or by name) from Subtasks.

     A non-numeric argument is resolved to an ID by name lookup; removal
     of an unresolved name or of a task with children asks the user for
     confirmation first.  "all" / "-a" clears every subtask.  IDs are
     renumbered sequentially afterwards.

     NOTE(review): self.get('Subtasks')[int(task)] indexes by list
     position -- assumes a task's ID equals its index (IDs are renumbered
     densely at the end, so this presumably holds); confirm.
     '''
     if not isNumber(task):
         # resolve a task name to its numeric ID
         for t in self.get('Subtasks'):
             if t.get('Task') == task:
                 task = t.get('ID')
                 break
     # confirm when the name was not resolved or the task has children
     if not isNumber(task) or len(self.get('Subtasks')[int(task)].get('Subtasks')) > 0:
         if not confirm("Are you sure you wish to remove '"+task+"'?", self.win):
             return
     if isNumber(task):
         self['Subtasks'] = [t for t in self.get('Subtasks') if t.get('ID') != int(task)]
     elif task.lower() == "all" or task.lower() == "-a":
         self['Subtasks'] = []
     # renumber the remaining tasks so IDs stay dense
     for i in range(len(self['Subtasks'])):
         self['Subtasks'][i]['ID'] = i
Esempio n. 11
0
def vk2xmpp(id):
    """Convert between a numeric VK id and a Jabber id.

    Args:
        id: a Jabber or VK id
    Returns:
        id@TransportID when id is a number,
        the bare (int) id when id has the form id@TransportID,
        TransportID unchanged when id equals TransportID.
    """
    if not utils.isNumber(id) and "@" in id:
        # JID form: keep the local part only, numeric ids become ints
        local = id.partition("@")[0]
        id = int(local) if utils.isNumber(local) else local
    elif id != TransportID:
        # bare id: qualify it with the transport domain
        id = u"{0}@{1}".format(id, TransportID)
    return id
Esempio n. 12
0
def vk2xmpp(id):
	"""
	Converts a numeric VK ID to a Jabber ID and vice versa.

	Args:
		id: a Jabber or VK id
	Returns:
		id@TransportID if id is a number,
		the bare id if id has the form id@TransportID,
		TransportID unchanged if id equals TransportID
	"""
	if utils.isNumber(id) or "@" not in id:
		# bare id: append the transport domain unless it IS the domain
		return id if id == TransportID else u"%s@%s" % (id, TransportID)
	# JID form: strip the domain, normalizing numeric ids to int
	id = id.split("@")[0]
	if utils.isNumber(id):
		id = int(id)
	return id
Esempio n. 13
0
 def setValue(selforcls, newValue, clip=True):
     """Validate, optionally clip and store a numerical value.

     None is silently ignored; non-numerical input raises
     DefaultValueError.
     """
     if newValue is None:
         return  # nothing to do
     testfor(isNumber(newValue), DefaultValueError,
             u"A value has to be numerical! ({})".format(newValue))
     # clip to the configured min/max when requested
     value = selforcls.clip(newValue) if clip else newValue
     super(ParameterNumerical, selforcls).setValue(value)
Esempio n. 14
0
 def setDecimals(selforcls, newDecimals):
     """Set the number of decimals, deriving a default from the value
     range when None is given; the result is clamped to a sane range."""
     if newDecimals is None:
         # derive a default from the magnitude of the value range
         lo, hi = selforcls._valueRange
         newDecimals = round(math_log10(math_fabs(hi - lo)))
     else:
         testfor(
             isNumber(newDecimals) and newDecimals >= 0, DecimalsError,
             "Parameter decimals has to be a positive number!")
     # clamp into [0, largest representable decimal exponent]
     newDecimals = min(max(newDecimals, 0), sys.float_info.max_10_exp)
     selforcls._decimals = int(newDecimals)
Esempio n. 15
0
 def setDisplayValues(selforcls, newDisplayValues):
     """Install a number -> text display mapping after validating it;
     None leaves the current mapping untouched."""
     if newDisplayValues is None:
         return
     testfor(isMap(newDisplayValues), DisplayValuesError,
             "Expected a display value mapping of numbers to text!")
     keysOk = all([isNumber(k) for k in newDisplayValues.keys()])
     testfor(keysOk, DisplayValuesError,
             "Display value keys have to be numbers!")
     valuesOk = all([isString(s) for s in newDisplayValues.values()])
     testfor(valuesOk, DisplayValuesError,
             "Display values have to be text!")
     # TODO: also add reverse lookup
     selforcls._displayValues = newDisplayValues
Esempio n. 16
0
    def getExploded(self, namebase, atomsMass):
        """Explode a species name into its individual atoms.

        Replaces every atom name by a unique placeholder token, expands
        numeric multipliers (a digit repeats the preceding atom) and maps
        the placeholders back to real atom names.

        :param namebase: species name (isomer/spin prefixes are carried
            over from self.name)
        :param atomsMass: dict of atom name -> mass (only keys are used)
        :return: sorted list of atom names
        """
        import itertools
        # carry over any isomer/spin prefix present in the full name
        for prefix in ("p_", "o_", "i_", "c_", "l_", "m_"):
            if prefix in self.name:
                namebase = prefix + namebase
        # copy namebase to replace after
        specName = namebase + ""
        # atom names sorted longest-first so e.g. "He" matches before "H"
        atoms = sorted(atomsMass.keys(), key=len, reverse=True)
        # produce unique placeholder tokens, one per atom
        alpha = ["".join(x) for x in itertools.product("XYZ", repeat=4)]
        # check to have enough combinations
        if len(atoms) > len(alpha):
            sys.exit("ERROR: in species parser alpha needs to be extended!")

        # replace atoms slash-separated
        for i in range(len(atoms)):
            specName = specName.replace(atoms[i], "/" + alpha[i] + "/")
        # collapse double slashes
        while "//" in specName:
            specName = specName.replace("//", "/")
        # split at slashes
        aspec = [x for x in specName.split("/") if x != ""]

        # expand multipliers: a number repeats the previous non-number
        exploded = []
        aold = None
        for a in aspec:
            if isNumber(a):
                if aold is None:
                    # FIX: a leading number used to raise NameError (aold
                    # unbound); keep the token so the error path below
                    # reports it instead of crashing
                    exploded.append(a)
                else:
                    for _ in range(int(a) - 1):
                        exploded.append(aold)
            else:
                exploded.append(a)
            aold = a

        # map placeholders back to real atom names
        try:
            exploded = [atoms[alpha.index(x)] for x in exploded]
        except (ValueError, IndexError):
            # FIX: narrowed from a bare `except:` which also swallowed
            # SystemExit/KeyboardInterrupt
            print("ERROR: wanted to parse ", namebase)
            print(" but something went wrong with ", exploded)
            print(" Available atoms are:", atoms)
            print(" Add to atom list file if needed.")
            sys.exit()

        return sorted(exploded)
Esempio n. 17
0
def convDAT(lines, savename):
    """ Convert .DAT diffraction file format in a standard text file

    :param lines: List of the text lines containing the data
    :param savename: Save the converted data to the file 'savename'
    :return: None if success otherwise return an error message.
    """
    # Remove empty lines
    newlines = [lin for lin in lines if lin]
    # Remove comment lines (starting with #)
    lines = [lin for lin in newlines if not lin.startswith('#')]
    # Remove multiple spaces
    newlines = [" ".join(lin.split()) for lin in lines]
    # FIX: guard against an empty file and against headers with fewer
    # than three fields -- the original raised IndexError on those
    if not newlines:
        return "Unknown file format"
    # The first line should contain 2theta start, step and final value
    params = newlines[0].split()
    errmsg = "Unknown file format"
    if (len(params) >= 3 and isNumber(params[0]) and isNumber(params[1])
            and isNumber(params[2])):
        start = float(params[0])
        step = float(params[1])
        stop = float(params[2])
        if stop > start:
            errmsg = None
    if errmsg is not None:
        return errmsg

    # Flatten the intensity values from all remaining lines
    Ilst = [lin.split() for lin in newlines[1:]]
    fIlst = [item for sublist in Ilst for item in sublist]
    lines = []
    for j, I in enumerate(fIlst):
        x = start + j * step
        lines.append("{0}\t{1}".format(x, I))

    # Save the new list in a file; FIX: use a context manager so the
    # handle is closed even on write errors (the original leaked it)
    with open(savename, 'w') as outfile:
        outfile.write("\n".join(lines))
    return errmsg
Esempio n. 18
0
 def filterNodes(self, x=None, y=None, z=None, t=None):
     """Return the nodes matching every given coordinate; when y is not
     given, only nodes with a numeric y ("full states") are kept."""
     result = self.nodes
     if x is not None:
         result = [node for node in result if node.x == x]
     if y is None:
         # default --> only full states
         result = [node for node in result if isNumber(node.y)]
     else:
         result = [node for node in result if node.y == y]
     if z is not None:
         result = [node for node in result if node.z == z]
     if t is not None:
         result = [node for node in result if node.t == t]
     return result
Esempio n. 19
0
    def addNode(self, node):
        """Register *node* in the graph; nodes with a numeric y are also
        indexed by time step in self.nodesFiltT.

        Raises Exception when a node with the same name already exists.
        """
        if node.name in self.nodeDict:
            raise Exception(f"node {node.name} already in graph")
        self.nodes.append(node)
        self.nodeDict[node.name] = node

        t = node.t
        if isNumber(node.y):
            # grow the per-timestep index up to slot t.
            # BUG FIX: the original appended `[[] * (t + 1 - l)]`, which
            # is always a single empty list, so any gap larger than one
            # slot left nodesFiltT too short and indexing below raised
            # IndexError; extend with one fresh list per missing slot.
            missing = t + 1 - len(self.nodesFiltT)
            self.nodesFiltT.extend([] for _ in range(missing))
            self.nodesFiltT[t].append(node)
Esempio n. 20
0
def update_wallst_word_count():
    """Incrementally update the pickled word -> {date: count} mapping
    with wallstreetcn articles not yet present in the checked-files list.

    Each processed path is appended to WORD_CNT_CHECKED_FILE; the first
    line of every article holds the timestamp, the rest is the body.

    NOTE(review): Python 2 only (print statement, str.decode/encode,
    dict.has_key); f_check is never closed explicitly.
    """
    # with open(WORD_CNT_FILE, 'r') as fp:
        # word_count = json.load(fp)
    # with open(WORD_CNT_CHECKED_FILE, 'r') as fp:
        # checked_list = [line.strip() for line in fp.readlines()]
    with open(const.WORD_CNT_FILE, 'rb') as fp:
        word_count = pickle.load(fp)
    with open(const.WORD_CNT_CHECKED_FILE, 'r') as fp:
        checked_list = set([line.strip().decode('utf-8') for line in fp.readlines()])
    f_check = open(const.WORD_CNT_CHECKED_FILE, 'a')
    for y in years:
        files = ["%s/%s/%s"%(WALLSTCN_DIR, y, f) for f in os.listdir("%s/%s/"%(WALLSTCN_DIR, y))]
        print y, len(files)
        if len(files) > 0:
            for f in files:
                # skip articles already counted in a previous run
                if f in checked_list:
                    continue
                # with open(WORD_CNT_CHECKED_FILE, 'a') as fp:
                    # fp.write(f + '\n')
                f_check.write(f.encode('utf-8') + '\n')
                with open(f, 'r') as fp:
                    text = fp.readlines()
                # normalize the Chinese date format (2017年1月2日) to 2017-1-2
                time = text[0].decode('utf-8').split('_')[0]
                time = time.strip()
                if time.find('-') == -1:
                    time = time.replace(u'年', '-').replace(u'月', '-').replace(u'日', '')
                '''
                if time.count(':') == 2:
                    dt = datetime.datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
                else:
                    dt = datetime.datetime.strptime(time, '%Y-%m-%d %H:%M')
                date = dt.strftime("%Y-%m-%d")
                '''
                date = time.split(' ')[0]

                # tokenize the body and bump per-word, per-day counters
                content = " ".join(text[1:])
                doc = [word for word in jieba.cut(content) if not word in stop_words]
                for word in doc:
                    if utils.isNumber(word):
                        continue
                    if not word_count.has_key(word):
                        word_count[word] = {}
                    if not word_count[word].has_key(date):
                        word_count[word][date] = 0
                    word_count[word][date] += 1

    # with open(WORD_CNT_FILE, 'w') as fp:
        # json.dump(word_count, fp)
    with open(const.WORD_CNT_FILE, 'wb') as fp:
        pickle.dump(word_count, fp)
Esempio n. 21
0
 def disAssembly(self, md, inSymbols, code, address, outputPath):
     """Disassemble *code* with capstone's disasm_lite and append the
     listing to <outputPath>/assembly, annotating immediate operands that
     match a known symbol with [imm->symbol].

     :param md: capstone disassembler (provides disasm_lite)
     :param inSymbols: mapping of immediate strings to symbol names
     :param code: raw machine-code bytes
     :param address: base address of the code
     :param outputPath: directory receiving the "assembly" listing
     """
     try:
         # PERF FIX: open the output file once -- the original reopened
         # it for every single instruction.  (Minor difference: an empty
         # file is now created even when there are zero instructions.)
         with open(outputPath + "/" + "assembly", "a+") as f:
             for (address, size, mnemonic,
                  op_str) in md.disasm_lite(code, address):
                 # immediates like "#0x1234" may resolve to a symbol
                 if utils.isNumber(op_str.lstrip("#")):
                     op_fun = inSymbols.get(op_str.lstrip("#"))
                     if op_fun is not None:
                         op_str = "[" + op_str.lstrip(
                             "#") + "->" + op_fun + "]"
                 f.write("0x%x:\t%s\t%s" % (address, mnemonic, op_str))
                 f.write("\n")
     except CsError as e:
         print("ERROR: %s" % e)
Esempio n. 22
0
 def plot(self, stats):
     """Plot mean +/- std against the series key; a non-numeric key is
     rendered as evenly spaced ticks with rotated text labels."""
     xvec = stats["seriesKey"]
     if not all(isNumber(value) for value in xvec):
         # categorical x axis: positions 0..n-1 carry the text labels
         positions = range(len(xvec))
         self._axes.set_xticks(positions)
         self._axes.set_xticklabels(xvec, rotation=15)
         xvec = positions
     self._axes.errorbar(xvec, stats["mean"], stats["meanStd"],
                         label=stats["cfg"])
     self._axes.set_xlabel(stats["seriesKeyName"])
     self._axes.set_ylabel("mean")
     self._axes.set_title(stats["title"])
Esempio n. 23
0
 def editTask(self, task, cbox):
     '''Edit the task name.

     task may be a numeric index into Subtasks or a task name; the
     matching task's text is loaded into the edit box cbox and the
     curses edit loop is entered.

     NOTE(review): if task is neither a valid index nor a matching name,
     t is never bound and the cbox.text assignment raises NameError; the
     edit loop continues beyond this excerpt.
     '''
     if isNumber(task):
         if int(task) < len(self.get('Subtasks')) and int(task) >= 0:
             t = self.get('Subtasks')[int(task)]
     else:
         for item in self.get('Subtasks'):
             if item.get('Task') == task:
                 t = item
     # load the current name and select all of it
     cbox.text = t.get('Task')
     cbox.bl = 0
     cbox.br = len(cbox.text)
     while True:
         self.win.erase()
         cbox.draw()
Esempio n. 24
0
    def __isCommand(self, key, word2):
        """Return True when `key word2` is of the form 'command arg, ...'
        that should be rewritten to 'command(arg, ...)' to support
        'command syntax'.
        """
        # guard clauses: any of these disqualifies the key
        if len(key) < 1 or len(word2) < 1:
            return False
        if not isValidName(key):
            return False
        if key in self.friends or key.startswith('#'):
            return False
        # the argument must be a name, a number or a literal string
        return isValidName(word2) or isNumber(word2) or isLiteralStr(word2)
Esempio n. 25
0
 def setValueRange(selforcls, newRange):
     """Validate and store a two-number value range, then re-clip the
     currently set value so it lies within the new range."""
     testfor(isList(newRange), ValueRangeError,
             "A value range is mandatory for a numerical parameter!")
     testfor(len(newRange) == 2, ValueRangeError,
             "A value range has to consist of two values!")
     testfor(all([isNumber(v) for v in newRange]), ValueRangeError,
             "A value range has to consist of numbers only!")
     # order the bounds and keep them finite-ish so inf/nan never leaks
     # into UI elements (+-1e200 is as good as +-inf here)
     lower = max(min(newRange), -1e200)
     upper = min(max(newRange), 1e200)
     selforcls._valueRange = lower, upper
     # re-apply the limits to the current value
     selforcls.setValue(selforcls.clip())
Esempio n. 26
0
    def slotItemChanged(self, item):
        """ Callback function when the user changes a cell content.

        Updates the backing data array and refreshes the plot when the
        edited cell holds a number that differs from the stored value;
        does nothing while the table is still being initialized.

        :param item: the table widget item that was edited
        :return: nothing
        """
        if self.init:
            return  # table is being populated programmatically
        text = item.text()
        if not isNumber(text):
            return
        col, row = item.column(), item.row()
        oldval = self.data[col][row]
        newval = float(text)
        if newval == oldval:
            return
        self.data[col][row] = newval
        self.pltw.dirty = True
        self.pltw.updatePlot()
        if self.pltw.dcursor is not None:
            self.pltw.dcursor.updateLinePos()
Esempio n. 27
0
def out_to_csv(in_filename, out_filename):
    """Convert a file of newline-separated numeric values, grouped under
    non-numeric header lines, into CSV: one comma-separated row per group.

    :param in_filename: input path; a non-numeric line starts a new
        group, numeric lines join the current group
    :param out_filename: output CSV path
    """
    data = []
    with open(in_filename) as in_f:
        lines = in_f.readlines()
    for line in lines:
        if not isNumber(line):
            # header line: start a new group
            data.append([])
        else:
            # NOTE(review): assumes the file begins with a header line;
            # a leading numeric line raises IndexError (as before)
            data[-1].append(float(line))
    with open(out_filename, 'w') as out_f:
        for row in data:
            # join idiom replaces the original last-column special case;
            # empty groups still produce no output line (as before)
            if row:
                out_f.write(",".join("{}".format(v) for v in row) + "\n")
Esempio n. 28
0
    def __isCommand(self, key, word2):
        """Decide whether `key word2` has the shape 'command arg, ...'
        which gets translated to 'command(arg, ...)' to allow
        'command syntax'.
        """
        # the key qualifies only if it is a valid, non-friend, non-comment
        # name and both tokens are non-empty (De Morgan of the original)
        keyOk = (isValidName(key) and key not in self.friends
                 and not key.startswith('#')
                 and len(key) >= 1 and len(word2) >= 1)
        if not keyOk:
            return False

        # the argument must be a name, a number or a literal string
        return isValidName(word2) or isNumber(word2) or isLiteralStr(word2)
Esempio n. 29
0
def convPRF(lines, savename):
    """ Convert a PRF file generated by Fullprof in a standard text file.

        Save the converted data in the file 'savename'.

    :param lines: list of the text lines containing the PRF data
    :param savename: name of file to which converted data will be saved.
    :return: None if success otherwise return an error message.
    """
    # FIX: the original indexed lines[1]/nitems[1] even after the sanity
    # checks had failed, crashing on short input instead of returning the
    # error message -- return early instead
    if not any(" 2Theta" in lin for lin in lines) or len(lines) < 7:
        return "Unknown file format"
    nitems = [x for x in lines[1].split(' ') if x]
    if len(nitems) < 2 or not isNumber(nitems[1]):
        return "Unknown file format"
    npt = int(nitems[1])
    if npt > len(lines):
        return "Unknown file format"

    startidx = [idx for idx, s in enumerate(lines) if " 2Theta" in s][0] + 1
    errmsg = None
    newlines = []
    # Add a # at the beginning of the first line
    newlines.append('#' + lines[0])
    newlines.extend(["# 2Theta\tYobs\tYcal\tYobs_Ycal"])

    # For each data line keep only the 4 first items
    for l, line in enumerate(lines):
        if l >= startidx and l < npt + startidx:
            items = line.split('\t')
            newlines.append("\t".join(items[:4]))

    # Save the new list in a file; FIX: the context manager closes the
    # handle, which the original leaked
    with open(savename, 'w') as outfile:
        outfile.write("\n".join(newlines))
    return errmsg
Esempio n. 30
0
def load_sentences(path, zeros):
    """
    Load sentences. A line must contain a word and its tags.
    Sentences are separated by empty lines.

    :param path: text file with tab-separated "word<TAB>tag1<TAB>tag2" lines
    :param zeros: when true, each line is passed through isNumber() first
        (NOTE(review): replacing the line with isNumber's return value
        looks dubious -- presumably digit normalization was intended;
        confirm against the original codebase)
    :return: list of sentences, each a list of [word, tag1, tag2] triples
    """
    sentences = []
    sentence = []
    with open(path, 'r') as f:
        # for line in codecs.open(path, 'r', 'utf8'):
        for line in f.readlines():
            line = isNumber(line.rstrip()) if zeros else line.rstrip()
            if not line:
                # a blank line terminates the current sentence
                if len(sentence) > 0:
                    sentences.append(sentence)
                    sentence = []
            else:
                word = line.split('\t')
                # BUG FIX: word[2] is read below, so three fields are
                # required -- the original only asserted two and then
                # raised IndexError on two-column lines
                assert len(word) >= 3
                sentence.append([word[0], word[1], word[2]])
    if len(sentence) > 0:
        sentences.append(sentence)
    return sentences
Esempio n. 31
0
def extract_text(path):
    """Read every file under *path*, tokenize it with jieba (dropping
    stop words and numeric tokens) and write all documents, one per
    line, to the module-level ``text_file``."""
    files = [f for f in os.listdir(path)]
    content = []
    for f in files:
        with open(u'%s/%s' % (path, f), 'r') as fp:
            raw = fp.readlines()
        # tokenize the utf-8 decoded document text
        tokens = jieba.cut(' '.join(raw).decode('utf-8'))
        kept = [
            w for w in tokens
            if (not w in stop_words) and (not utils.isNumber(w))
        ]
        # collapse whitespace so each document occupies a single line
        doc = ' '.join(kept).replace('\t', ' ').replace('\n', ' ')
        content.append(doc)
    with open(text_file, 'w') as f:
        f.write('\n'.join(content).encode('utf-8'))
Esempio n. 32
0
    def getNameLatex(self, name):
        """Render a species name as a LaTeX label: +/- become
        superscripts, digits become subscripts, and the _gas suffix is
        replaced by the \\gas command (electrons are special-cased)."""
        # electrons are special
        if name == "E_gas":
            return "e$^-$"

        parts = []
        for char in name:
            if char in ["+", "-"]:
                # charge signs as superscripts
                parts.append("$^" + char + "$")
            elif isNumber(char):
                # stoichiometric counts as subscripts
                parts.append("$_" + char + "$")
            else:
                # standard characters pass through unchanged
                parts.append(char)
        latexName = "".join(parts)

        # replace _gas with the dedicated latex command
        return latexName.replace("_gas", "$\\gas$")
Esempio n. 33
0
def save_wallst_text():
    """Tokenize every already-checked wallstreetcn article and append the
    cleaned text (one article per line) to WST_TOPIC_FILE.

    Only files whose path appears in WST_WORD_CNT_CHECKED_FILE are used;
    the first (timestamp) line of each article is skipped.

    NOTE(review): Python 2 only (print statement, str.encode on write).
    """
    years = [
        '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018'
    ]
    with open(const.WST_WORD_CNT_CHECKED_FILE, 'r') as fp:
        checked_list = ([line.strip() for line in fp.readlines()])

    for y in years:
        files = [f for f in os.listdir('%s/%s' % (const.WALLSTCN_DIR, y))]
        print y, len(files)
        for f in files:
            fname = '%s/%s/%s' % (const.WALLSTCN_DIR, y, f)
            if fname in checked_list:
                # skip the timestamp line, join the article body
                with open(fname, 'r') as fp:
                    content = ' '.join(
                        [line.strip() for line in fp.readlines()[1:]])
                # tokenize, dropping stop words and purely numeric tokens
                content = jieba.cut(content)
                content = [
                    w for w in content
                    if (w not in stop_words) and (not utils.isNumber(w))
                ]
                content = ' '.join(content).encode('utf-8')
                with open(const.WST_TOPIC_FILE, 'a') as fp:
                    fp.write(content + '\n')
Esempio n. 34
0
def get_word_heat(key_word,
                  threshold=0.5,
                  similar_words=1000,
                  start_date="2016-01-01",
                  end_date=datetime.datetime.today().strftime("%Y-%m-%d"),
                  look_back=7):
    """Compute daily "heat" series for key_word plus its word2vec
    neighbours and save them (absolute / relative / similarity-weighted
    look_back-day rolling sums) as CSV files in ASSET_CLASS_DIR.

    :param key_word: seed word whose heat is measured
    :param threshold: minimum similarity for a neighbour to be included
    :param similar_words: number of neighbours requested from the model
    :param start_date: first day filled into the series (YYYY-MM-DD)
    :param end_date: NOTE(review): immediately overwritten with today's
        date below, so this parameter is effectively unused -- confirm
    :param look_back: rolling-window length in days

    NOTE(review): Python 2 pandas code (dict.has_key, iteritems, .ix).
    """

    print(key_word)
    # start from the seed word's own per-day counts
    heat_count = word_count[key_word].copy()
    heat_count_relative = word_count[key_word].copy()
    heat_count_weighted = word_count[key_word].copy()
    similar_df = pd.DataFrame({"word": [key_word], "distance": [1.0]})
    start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
    end_date = datetime.datetime.today()
    # add counts from every sufficiently similar neighbour word
    for word, dis in model.most_similar(key_word, topn=similar_words):
        if dis < threshold:
            break
        if not utils.isNumber(word) and word in word_count:
            # NOTE(review): the frame was built with columns "word" and
            # "distance" but the row is assigned as [dis, word] -- confirm
            # the resulting column order is as intended
            similar_df.ix[similar_df.shape[0]] = [dis, word]
            for day, value in word_count[word].iteritems():
                if not heat_count.has_key(day):
                    heat_count[day], heat_count_relative[
                        day], heat_count_weighted[day] = 0, 0, 0
                heat_count[day] += value
                heat_count_relative[day] += value
                heat_count_weighted[day] += value * dis

    # fill in days on which no word occurred
    current_date = start_date
    while current_date < end_date:
        key = current_date.strftime("%Y-%m-%d")
        if not heat_count.has_key(key):
            heat_count[key], heat_count_relative[key], heat_count_weighted[
                key] = 0, 0, 0
        if not total_word_count.has_key(key):
            total_word_count[key] = 0
        current_date = current_date + datetime.timedelta(1)
    # daily totals for the whole corpus, date-indexed and sorted
    df = pd.DataFrame({
        'date': total_word_count.keys(),
        'count': total_word_count.values()
    })
    df.index = pd.to_datetime(df['date'], format="%Y-%m-%d")
    df.sort_index(inplace=True)
    # df = df[df.index >= '2016-01-01']
    # print df.tail()

    heat_df = pd.DataFrame({
        "date": heat_count.keys(),
        "absolute": heat_count.values(),
        'relative': heat_count_relative.values(),
        'weighted': heat_count_weighted.values()
    })
    heat_df.index = heat_df["date"].map(
        lambda x: datetime.datetime.strptime(x, "%Y-%m-%d"))
    heat_df.sort_index(inplace=True)
    heat_df['total'] = df['count']
    # print heat_df.tail()
    # rolling look_back-day sums (relative/weighted as percent of total)
    heat_df.loc[:, 'total'] = heat_df['total'].rolling(window=look_back).sum()
    heat_df.loc[:, 'absolute'] = heat_df['absolute'].rolling(
        window=look_back).sum()
    heat_df.loc[:, 'relative'] = heat_df['relative'].rolling(
        window=look_back).sum() * 100. / heat_df['total']
    heat_df.loc[:, 'weighted'] = heat_df['weighted'].rolling(
        window=look_back).sum() * 100. / heat_df['total']

    similar_df.to_csv("%s/%s_%.1f_words.csv" %
                      (const.ASSET_CLASS_DIR, key_word, threshold),
                      index=False,
                      encoding="utf-8")
    heat_df.to_csv("%s/%s_%.1f.csv" %
                   (const.ASSET_CLASS_DIR, key_word, threshold),
                   index=False)
Esempio n. 35
0
def readArff(fileSrc):
	"""Parse an ARFF file into raw rows plus lookup and binning structures.

	Returns a dict with keys:
	  'data'                 - list of {attributeName: value} row dicts
	  'attributes'           - list of [name, 'real' | list-of-options]
	  'relation'             - the @relation name
	  'lookup'               - "attrName value" -> list of row indices
	  'continuousVariables'  - util.continuousBin per real attribute
	  'categoricalVariables' - util.categoricalBin per categorical attribute
	"""
	relation = ""									# @relation name
	attributes = []									# [name, 'real' or option list]
	rawData = []									# main data storage
	reverseLookup = {}								# store by value for reverse lookup
	continuousVariables = {}						# bins for real-valued attributes
	categoricalVariables = {}						# bins for categorical attributes
	# context manager so the handle is closed even if parsing raises
	with codecs.open(fileSrc, 'rb', 'utf-8') as dataFile:
		print("Reading file...")
		lines = dataFile.readlines()				# read all lines
	if settings.PROGRESS_BAR:
		util.updateProgress(0)						# create a progress bar
	# test every line and extract its relevant information
	for idx, line in enumerate(lines):
		if settings.PROGRESS_BAR:
			util.updateProgress(float(idx) / float(len(lines)))
		if line[0] == '%':							# ignore comments
			continue
		elif line[0] == '@':						# metadata line
			if '@relation' in line:					# relation name
				relation = line.split(" ")[1]
			elif "@attribute" in line:				# attribute declaration
				arrayLine = line.split(" ")
				attributes.append([arrayLine[1]])
				if "real" not in arrayLine[2]:		# categorical: read the option set
					# select text between braces, e.g. "{a, b, c}"
					attrs = re.search(r'\{(.*?)\}', line).group()
					attrs = re.sub(r'[\{\}]', "", attrs)	# remove the braces
					# split() already yields a fresh list; no copy loop needed
					attributes[-1].append(attrs.split(", "))
				else:								# real-valued attribute
					attributes[-1].append('real')
		elif line[0] == " ":						# skip indented/blank-ish lines
			continue
		else:										# data row
			values = line.replace(" ", "").replace("\n", "").split(",")
			newDataEntry = {}						# row object keyed by attribute name
			classLabel = values[-1]					# last column is the class label
			# 'col' (not 'idx') so the outer enumerate index is not shadowed
			for col, value in enumerate(values):
				attribute = attributes[col]
				if util.isNumber(value):			# convert numeric strings to float
					value = float(value)
				# reverse lookup under the key "attributeName attributeValue";
				# len(rawData) is the index this row will occupy
				rlKey = attribute[0] + " " + str(value)
				if rlKey in reverseLookup:
					reverseLookup[rlKey].append(len(rawData))
				else:
					reverseLookup[rlKey] = [len(rawData)]
				newDataEntry[attribute[0]] = value	# store value under its proper key
				# route the value into the matching bin type
				if attribute[1] == 'real':			# real attribute -> continuous bin
					if attribute[0] not in continuousVariables:
						continuousVariables[attribute[0]] = util.continuousBin(attribute[0])
					continuousVariables[attribute[0]].add(value, classLabel)
				else:								# categorical attribute -> categorical bin
					if attribute[0] not in categoricalVariables:
						categoricalVariables[attribute[0]] = util.categoricalBin(attribute[1])
					categoricalVariables[attribute[0]].add(value, classLabel)
			rawData.append(newDataEntry)			# append finished row
	# END OF FOR LOOP
	results = {}
	results['data'] = rawData
	results['attributes'] = attributes
	results['relation'] = relation
	results['lookup'] = reverseLookup
	results['continuousVariables'] = continuousVariables
	results['categoricalVariables'] = categoricalVariables
	if settings.PROGRESS_BAR:
		util.updateProgress(1)
	print("\nFile read complete \n")
	return results
Esempio n. 36
0
    def __init__(self, trainingData, attributes):
        """Train a naive Bayes model from rows of {attribute: value} dicts.

        Builds:
          self.classifierBins - class label -> list of training rows
          self.probability    - prior and "value given class" probabilities
                                for categorical values
          self.numericBins    - "attr given class" -> sorted value list,
                                plus "... mean" / "... stdev" summaries
        """
        print("Training Bayesian Classifier with " + str(
            len(trainingData)) + " data entries.\n")
        # COUNT VARIABLES
        print("Counting all variables:")
        if settings.PROGRESS_BAR:
            util.updateProgress(0)
        # Sort the training data into bins per class label while recording
        # the occurrence counts of every categorical value.
        numOfEntries = float(len(trainingData))
        categoricalCounts = {}    # occurrence count per categorical value
        self.classifierBins = {}  # class label -> rows of that class
        self.probability = {}     # probability tables
        self.numericBins = {}     # sorted numeric values per "attr given class"
        count = 0.0
        for entry in trainingData:                  # for every data row...
            count += 1.0
            if settings.PROGRESS_BAR:
                util.updateProgress(count / numOfEntries)
            for attr in entry:                      # for each attribute...
                if not util.isNumber(entry[attr]):  # categorical attribute
                    if entry[attr] in categoricalCounts:
                        categoricalCounts[entry[attr]] += 1.0
                    else:                           # first sighting of this value
                        categoricalCounts[entry[attr]] = 1.0
                    if attr == settings.CLASSIFIER_NAME:
                        # group the full row under its class label
                        if entry[attr] in self.classifierBins:
                            self.classifierBins[entry[attr]].append(entry)
                        else:
                            self.classifierBins[entry[attr]] = [entry]
                else:                               # numeric attribute
                    key = attr + ' given ' + entry[settings.CLASSIFIER_NAME]
                    if key in self.numericBins:
                        # keep the per-class value list sorted on insert
                        bisect.insort(self.numericBins[key], entry[attr])
                    else:
                        self.numericBins[key] = [entry[attr]]
        # DEAL WITH CONTINUOUS VARIABLES
        # Copy the keys first: the loop inserts "... mean"/"... stdev"
        # entries, and iterating a live keys() view while inserting breaks
        # under Python 3 (py2 happened to return a list snapshot).
        initialKeys = list(self.numericBins.keys())
        for key in initialKeys:
            self.numericBins[key + " mean"] = np.mean(self.numericBins[key])
            self.numericBins[key + " stdev"] = np.std(self.numericBins[key])
        # Smoothing: give never-seen categorical values a tiny probability
        # instead of zero.
        for attr in attributes:
            if attr[1] != 'real':
                for attrType in attr[1]:
                    if attrType not in self.probability:
                        self.probability[attrType] = .5 / numOfEntries
                        for name in self.classifierBins:
                            self.probability[attrType + " given " +
                                             name] = .5 / len(
                                                 self.classifierBins[name])

        # ASSIGN PROBABILITIES
        print("\n\nAssigning probabilities:")
        if settings.PROGRESS_BAR:
            util.updateProgress(0)
        for key in categoricalCounts.keys():  # unconditional probabilities
            self.probability[key] = self.getProbability(
                categoricalCounts[key], numOfEntries)
        # Snapshot the plain attribute values before conditional "X given Y"
        # keys are added below (list() keeps py2 semantics and is py3-safe).
        attrs = list(categoricalCounts.keys())
        count = 0.0                           # drives the status bar
        for key in self.classifierBins.keys():  # for each class label...
            count += 1
            if settings.PROGRESS_BAR:
                util.updateProgress(count / float(
                    len(self.classifierBins.keys())))

            # count categorical values within this class only
            for row in self.classifierBins[key]:
                for rowKey in row:
                    if not util.isNumber(row[rowKey]):
                        newKey = row[rowKey] + " given " + key
                        if newKey in categoricalCounts:
                            categoricalCounts[newKey] += 1.0
                        else:
                            categoricalCounts[newKey] = 1.0
            # conditional probability of each value given this class
            for attrValue in attrs:
                countKey = attrValue + " given " + key
                if countKey in categoricalCounts:
                    self.probability[countKey] = self.getProbability(
                        categoricalCounts[countKey],
                        len(self.classifierBins[key]))
                else:  # value never occurred with this class
                    self.probability[countKey] = self.getProbability(
                        0, len(self.classifierBins[key]))
        if settings.PROGRESS_BAR:
            util.updateProgress(1)
        print("\nModel creation complete\n")
Esempio n. 37
0
	def __init__(self, trainingData, attributes):
		"""Train a naive Bayes model from rows of {attribute: value} dicts.

		Builds self.classifierBins (rows grouped by class label),
		self.probability (prior and "value given class" probabilities for
		categorical values) and self.numericBins (sorted values plus
		"... mean"/"... stdev" per "attr given class" key).
		"""
		print("Training Bayesian Classifier with " + str(len(trainingData)) + " data entries.\n")
		# COUNT VARIABLES
		print("Counting all variables:")
		if settings.PROGRESS_BAR:
			util.updateProgress(0)
		# Sort rows into bins per class label while recording occurrence
		# counts for every categorical value.
		numOfEntries = float(len(trainingData))
		categoricalCounts = {}		# occurrence count per categorical value
		self.classifierBins = {}	# class label -> rows of that class
		self.probability = {}		# probability tables
		self.numericBins = {}		# sorted numeric values per "attr given class"
		count = 0.0
		for entry in trainingData:	# for every data row...
			count += 1.0
			if settings.PROGRESS_BAR:
				util.updateProgress(count / numOfEntries)
			for attr in entry:		# for each attribute...
				if not util.isNumber(entry[attr]):			# categorical attribute
					if entry[attr] in categoricalCounts:	# seen before: bump count
						categoricalCounts[entry[attr]] += 1.0
					else:									# first sighting
						categoricalCounts[entry[attr]] = 1.0
					if attr == settings.CLASSIFIER_NAME:	# the class column itself
						if entry[attr] in self.classifierBins:
							self.classifierBins[entry[attr]].append(entry)
						else:
							self.classifierBins[entry[attr]] = [entry]
				else:										# numeric attribute
					key = attr + ' given ' + entry[settings.CLASSIFIER_NAME]
					if key in self.numericBins:
						# keep the per-class value list sorted on insert
						bisect.insort(self.numericBins[key], entry[attr])
					else:
						self.numericBins[key] = [entry[attr]]
		# DEAL WITH CONTINUOUS VARIABLES
		# Copy the keys first: the loop inserts "... mean"/"... stdev" keys,
		# and iterating a live keys() view while inserting breaks on Python 3
		# (py2 happened to return a list snapshot).
		initialKeys = list(self.numericBins.keys())
		for key in initialKeys:
			self.numericBins[key + " mean"] = np.mean(self.numericBins[key])	# mean per "attr given class"
			self.numericBins[key + " stdev"] = np.std(self.numericBins[key])	# stdev per "attr given class"
		# Smoothing: never-seen categorical values get a tiny probability
		for attr in attributes:
			if attr[1] != 'real':
				for attrType in attr[1]:
					if attrType not in self.probability:
						self.probability[attrType] = .5 / numOfEntries
						for name in self.classifierBins:
							self.probability[attrType + " given " + name] = .5 / len(self.classifierBins[name])

		# ASSIGN PROBABILITIES
		print("\n\nAssigning probabilities:")
		if settings.PROGRESS_BAR:
			util.updateProgress(0)
		for key in categoricalCounts.keys():	# unconditional probabilities
			self.probability[key] = self.getProbability(categoricalCounts[key], numOfEntries)
		# Snapshot plain attribute values before conditional "X given Y" keys
		# are added below (list() keeps py2 semantics and is py3-safe).
		attrs = list(categoricalCounts.keys())
		count = 0.0								# drives the status bar
		for key in self.classifierBins.keys():	# for each class label...
			count += 1
			if settings.PROGRESS_BAR:
				util.updateProgress(count / float(len(self.classifierBins.keys())))

			# count categorical values within this class only
			for row in self.classifierBins[key]:
				for rowKey in row:
					if not util.isNumber(row[rowKey]):
						newKey = row[rowKey] + " given " + key
						if newKey in categoricalCounts:
							categoricalCounts[newKey] += 1.0
						else:
							categoricalCounts[newKey] = 1.0
			# conditional probability of each value given this class
			for attrValue in attrs:
				countKey = attrValue + " given " + key
				if countKey in categoricalCounts:
					self.probability[countKey] = self.getProbability(categoricalCounts[countKey], len(self.classifierBins[key]))
				else:							# value never occurred with this class
					self.probability[countKey] = self.getProbability(0, len(self.classifierBins[key]))
		if settings.PROGRESS_BAR:
			util.updateProgress(1)
		print("\nModel creation complete\n")
Esempio n. 38
0
 def setStepping(selforcls, newStepping):
     """Store a new stepping value after validating it is numeric.

     A value of None means "leave the current stepping untouched".
     Raises SteppingError (via testfor) for non-numeric input.
     """
     if newStepping is not None:
         testfor(isNumber(newStepping), SteppingError,
                 "Parameter has to be a number!")
         selforcls._stepping = newStepping
Esempio n. 39
0
 def isDataType(cls, value):
     """ParameterNumerical is a fallback for all number not being float."""
     # Floats are excluded: they belong to a more specific parameter class.
     if isinstance(value, float):
         return False
     return isNumber(value)
def isDigit(docTerm):
    """Return True if docTerm is numeric once comma separators are removed.

    E.g. "1,234" counts as a digit term; "abc" does not.
    """
    # str.replace suffices for a literal comma (no regex needed), and the
    # redundant "if X: return True / return False" branch collapses to a
    # single boolean return.
    return bool(util.isNumber(docTerm.replace(",", "")))