Beispiel #1
0
def get_matrix_rstr(mode, dic_json, desc_arg=None):
    """Build sparse per-text count rows from the repeats reported by ``Rstr_max``.

    Args:
        mode: Underscore-separated string. Its last field is read as
            ``"<lenmax>,<supportmax>"`` (an int and a float); when its second
            field equals ``"rel"`` the rows are passed through
            ``relative_transformation`` before being returned.
        dic_json: Mapping whose values are mappings holding the text under
            the ``"text_line"`` key.
        desc_arg: ``None`` for a training corpus (descriptors are collected).
            Otherwise a sequence of descriptor strings (test corpus): repeats
            whose text is not in it are ignored. Its items must be hashable.

    Returns:
        A ``(desc, X)`` tuple. ``desc`` is the list of substrings kept as
        descriptors (left empty when ``desc_arg`` is given). ``X`` is a list
        of ``{column_id: count}`` dicts, one per text, followed by one extra
        "virtual" row that has a 1 for every descriptor position.

    Raises:
        IndexError: if ``mode`` lacks the expected fields.
        ValueError: if ``lenmax`` / ``supportmax`` are not numeric.
    """
    texts = [infos["text_line"] for infos in dic_json.values()]

    # Parse the mode string once instead of re-splitting it for every field.
    mode_fields = mode.split('_')
    length_field, support_field = mode_fields[-1].split(',')[:2]
    lenmax = int(length_field)
    # Upper bound (exclusive) on the number of distinct texts a repeat may hit.
    max_support = float(support_field) * len(texts)

    rstr = Rstr_max()
    for s in texts:
        rstr.add_str(s)
    X = [{} for _ in texts]
    r = rstr.go()

    # Set-based membership avoids a linear scan of desc_arg for every repeat.
    known_desc = None if desc_arg is None else frozenset(desc_arg)

    cpt_str = 0
    desc = []
    for (offset_end, nb), (l, start_plage) in r.items():
        ss = rstr.global_suffix[offset_end - l:offset_end]
        # Known dimension issue: not every descriptor is necessarily present
        # in this corpus, so some instances may end up with nothing.
        if known_desc is not None and ss not in known_desc:
            continue
        list_occur = [rstr.idxString[rstr.res[o]]
                      for o in range(start_plage, start_plage + nb)]
        set_occur = set(list_occur)
        if len(set_occur) <= 1:
            continue
        if not (len(ss) < lenmax and len(set_occur) < max_support):
            continue
        # NOTE(review): column ids are assigned in discovery order; in test
        # mode they are not aligned with positions in desc_arg -- confirm
        # whether that is intended.
        for id_text in list_occur:
            row = X[id_text]
            row[cpt_str] = row.get(cpt_str, 0) + 1
        if desc_arg is None:  # training corpus: record the descriptor
            desc.append(ss)
        cpt_str += 1

    # Append a virtual instance so that train and test matrices end up with
    # homogeneous dimensions.
    descriptors = desc if desc_arg is None else desc_arg
    # Same keys as {descriptors.index(d): 1 for d in descriptors}, computed
    # in a single pass instead of one list scan per descriptor.
    first_position = {}
    for position, d in enumerate(descriptors):
        first_position.setdefault(d, position)
    X.append({position: 1 for position in first_position.values()})

    if mode_fields[1] == 'rel':
        X = relative_transformation(X)

    return desc, X
 def setUp(self):
   """Load every string returned by getString() into a fresh Rstr_max."""
   self.list_s = self.getString()
   self.rstr = Rstr_max()
   for text in self.list_s:
     self.rstr.add_str(text)
class Test_rstrmax:
  def setUp(self):
    self.list_s = self.getString()
    self.rstr = Rstr_max()
    for s in self.list_s :
      self.rstr.add_str(s)

  def test_rstr_max(self) :
    r = self.rstr.go()
    for (offset_end, nb), (l, start_plage) in r.iteritems():
      ss = self.rstr.global_suffix[offset_end-l:offset_end]
#      ss = self.rstr.array_str[idStr][end-l:end]
      offset_end -= 1
      id_chaine = self.rstr.idxString[offset_end]
      s = self.rstr.global_suffix
      idx = 0
      for i in xrange(nb):
        idx = s.index(ss, idx) + 1
#      except ValueError, e:
#        print "+++", ss, end, i, nb
#      try:
#      self.assertRaises(ValueError, s.index, ss, idx)
#        print "***", ss, end, i, nb
#      except ValueError, e:
#        pass

  def test_maximality(self) :
    r = self.rstr.go()
    for (offset_end, nb), (l, start_plage) in r.iteritems():
      ss = self.rstr.global_suffix[offset_end-l:offset_end]
      offset_end -= 1
      id_chaine = self.rstr.idxString[offset_end]
      s = self.rstr.array_str[id_chaine]

      set_left, set_right = set(), set()

      for o in range(start_plage, start_plage + nb) :
        offset_global = self.rstr.res[o]
        su = (self.rstr.idxPos[offset_global],self.rstr.idxString[offset_global])
        ls = len(self.rstr.array_str[su[1]])

        char_left = "START_STR%i"%(su[1]) if(su[0] == 0) else self.rstr.array_str[su[1]][su[0]-1]
        set_left.add(char_left)

        char_right = "END_STR%i"%(su[1]) if(su[0]+l == ls) else self.rstr.array_str[su[1]][su[0]+l]
        set_right.add(char_right)

      self.assertNotEqual(len(set_left), 1)
      self.assertNotEqual(len(set_right), 1)

  def utest_left_maximality(self) :
    r = self.rstr.go()
#    for (idStr, end, nb), (l, start_plage) in r.iteritems():
    for (offset_end, nb), (l, start_plage) in r.iteritems():
      ss = self.rstr.global_suffix[offset_end-l:offset_end]
#      ss = self.rstr.array_str[idStr][end-l:end]
      offset_end -= 1
      id_chaine = self.rstr.idxString[offset_end]
      s = self.rstr.array_str[id_chaine]
#      s = self.rstr.array_str[idStr]
      set_left_char = set()
      for o in range(start_plage, start_plage + nb) :
        offset_global = self.rstr.res[o]
        su = (self.rstr.idxPos[offset_global],self.rstr.idxString[offset_global])
#        su = self.rstr.array_suffix[o]
        if(su[0] == 0) :
          char_left = "START_STR"
        else :
          char_left = self.rstr.array_str[su[1]][su[0]-1]
        set_left_char.add((char_left,su[1]))
      if(len(set_left_char) == 1) :
        print
        print '*'*10
        print set_left_char
        print ss.encode('utf-8')
        print '*'*10
        print
      self.assertNotEqual(len(set_left_char), 1)


  def utest_right_maximality(self) :
    r = self.rstr.go()
    for (offset_end, nb), (l, start_plage) in r.iteritems():
      ss = self.rstr.global_suffix[offset_end-l:offset_end]
      offset_end -= 1
      id_chaine = self.rstr.idxString[offset_end]
      s = self.rstr.array_str[id_chaine]
      set_right_char = set()
      for o in range(start_plage, start_plage + nb) :
        offset_global = self.rstr.res[o]
        su = (self.rstr.idxPos[offset_global],self.rstr.idxString[offset_global])
        ls = len(self.rstr.array_str[su[1]])
        if(su[0]+l == ls) :
          char_right = "END_STR"
        else :
          char_right = self.rstr.array_str[su[1]][su[0]+l]
        set_right_char.add((char_right,su[1]))

      if(len(set_right_char) == 1) :
        print
        print '*'*10
        print set_right_char
        print ss.encode('utf-8')
        print '*'*10
        print
      self.assertNotEqual(len(set_right_char), 1)
Beispiel #4
0
 def setUp(self):
     """Load every string returned by getString() into a fresh Rstr_max."""
     self.list_s = self.getString()
     self.rstr = Rstr_max()
     for text in self.list_s:
         self.rstr.add_str(text)
Beispiel #5
0
class Test_rstrmax:
    def setUp(self):
        self.list_s = self.getString()
        self.rstr = Rstr_max()
        for s in self.list_s:
            self.rstr.add_str(s)

    def test_rstr_max(self):
        r = self.rstr.go()
        for (offset_end, nb), (l, start_plage) in r.iteritems():
            ss = self.rstr.global_suffix[offset_end - l:offset_end]
            #      ss = self.rstr.array_str[idStr][end-l:end]
            offset_end -= 1
            id_chaine = self.rstr.idxString[offset_end]
            s = self.rstr.global_suffix
            idx = 0
            for i in xrange(nb):
                idx = s.index(ss, idx) + 1


#      except ValueError, e:
#        print "+++", ss, end, i, nb
#      try:
#      self.assertRaises(ValueError, s.index, ss, idx)
#        print "***", ss, end, i, nb
#      except ValueError, e:
#        pass

    def test_maximality(self):
        r = self.rstr.go()
        for (offset_end, nb), (l, start_plage) in r.iteritems():
            ss = self.rstr.global_suffix[offset_end - l:offset_end]
            offset_end -= 1
            id_chaine = self.rstr.idxString[offset_end]
            s = self.rstr.array_str[id_chaine]

            set_left, set_right = set(), set()

            for o in range(start_plage, start_plage + nb):
                offset_global = self.rstr.res[o]
                su = (self.rstr.idxPos[offset_global],
                      self.rstr.idxString[offset_global])
                ls = len(self.rstr.array_str[su[1]])

                char_left = "START_STR%i" % (su[1]) if (
                    su[0] == 0) else self.rstr.array_str[su[1]][su[0] - 1]
                set_left.add(char_left)

                char_right = "END_STR%i" % (su[1]) if (
                    su[0] + l == ls) else self.rstr.array_str[su[1]][su[0] + l]
                set_right.add(char_right)

            self.assertNotEqual(len(set_left), 1)
            self.assertNotEqual(len(set_right), 1)

    def utest_left_maximality(self):
        r = self.rstr.go()
        #    for (idStr, end, nb), (l, start_plage) in r.iteritems():
        for (offset_end, nb), (l, start_plage) in r.iteritems():
            ss = self.rstr.global_suffix[offset_end - l:offset_end]
            #      ss = self.rstr.array_str[idStr][end-l:end]
            offset_end -= 1
            id_chaine = self.rstr.idxString[offset_end]
            s = self.rstr.array_str[id_chaine]
            #      s = self.rstr.array_str[idStr]
            set_left_char = set()
            for o in range(start_plage, start_plage + nb):
                offset_global = self.rstr.res[o]
                su = (self.rstr.idxPos[offset_global],
                      self.rstr.idxString[offset_global])
                #        su = self.rstr.array_suffix[o]
                if (su[0] == 0):
                    char_left = "START_STR"
                else:
                    char_left = self.rstr.array_str[su[1]][su[0] - 1]
                set_left_char.add((char_left, su[1]))
            if (len(set_left_char) == 1):
                print
                print '*' * 10
                print set_left_char
                print ss.encode('utf-8')
                print '*' * 10
                print
            self.assertNotEqual(len(set_left_char), 1)

    def utest_right_maximality(self):
        r = self.rstr.go()
        for (offset_end, nb), (l, start_plage) in r.iteritems():
            ss = self.rstr.global_suffix[offset_end - l:offset_end]
            offset_end -= 1
            id_chaine = self.rstr.idxString[offset_end]
            s = self.rstr.array_str[id_chaine]
            set_right_char = set()
            for o in range(start_plage, start_plage + nb):
                offset_global = self.rstr.res[o]
                su = (self.rstr.idxPos[offset_global],
                      self.rstr.idxString[offset_global])
                ls = len(self.rstr.array_str[su[1]])
                if (su[0] + l == ls):
                    char_right = "END_STR"
                else:
                    char_right = self.rstr.array_str[su[1]][su[0] + l]
                set_right_char.add((char_right, su[1]))

            if (len(set_right_char) == 1):
                print
                print '*' * 10
                print set_right_char
                print ss.encode('utf-8')
                print '*' * 10
                print
            self.assertNotEqual(len(set_right_char), 1)
 def setUp(self):
   """Feed the string returned by getString() into a fresh Rstr_max."""
   self.s = self.getString()
   engine = Rstr_max()
   self.rstr = engine
   engine.add_str(self.s)
class Test_rstrmax:
  def setUp(self):
    self.s = self.getString()
    self.rstr = Rstr_max()
    self.rstr.add_str(self.s)

  def test_rstr_max(self) :
    r = self.rstr.go()
    for ((idStr, end), nb), (l, start_plage) in r.iteritems():
      ss = self.rstr.array_str[idStr][end-l:end]
      s = self.rstr.array_str[idStr]
      idx = 0
      for i in xrange(nb):
        idx = s.index(ss, idx) + 1
#      except ValueError, e:
#        print "+++", ss, end, i, nb
#      try:
      self.assertRaises(ValueError, s.index, ss, idx)
#        print "***", ss, end, i, nb
#      except ValueError, e:
#        pass

  def test_left_maximality(self) :
    r = self.rstr.go()
    for ((idStr, end), nb), (l, start_plage) in r.iteritems():
      ss = self.rstr.array_str[idStr][end-l:end]
      s = self.rstr.array_str[idStr]
      set_left_char = set()
      for o in range(start_plage, start_plage + nb) :
        su = self.rstr.array_suffix[o]
        if(su[0] == 0) :
          char_left = "START_STR"
        else :
          char_left = self.rstr.array_str[su[1]][su[0]-1]
        set_left_char.add(char_left)
      if(len(set_left_char) == 1) :
        print
        print '*'*10
        print set_left_char
        print ss.encode('utf-8')
        print '*'*10
        print
      self.assertNotEqual(len(set_left_char), 1)

  def test_right_maximality(self) :
    r = self.rstr.go()
    for ((idStr, end), nb), (l, start_plage) in r.iteritems():
      ss = self.rstr.array_str[idStr][end-l:end]
      s = self.rstr.array_str[idStr]
      set_right_char = set()
      for o in range(start_plage, start_plage + nb) :
        su = self.rstr.array_suffix[o]
        ls = len(self.rstr.array_str[su[1]])
        if(su[0]+l == ls) :
          char_right = "END_STR"
        else :
          char_right = self.rstr.array_str[su[1]][su[0]+l]
        set_right_char.add(char_right)
      if(len(set_right_char) == 1) :
        print
        print '*'*10
        print set_right_char
        print ss.encode('utf-8')
        print '*'*10
        print
      self.assertNotEqual(len(set_right_char), 1)
 def setUp(self):
     """Feed the string returned by getString() into a fresh Rstr_max."""
     self.s = self.getString()
     engine = Rstr_max()
     self.rstr = engine
     engine.add_str(self.s)
class Test_rstrmax:
    def setUp(self):
        self.s = self.getString()
        self.rstr = Rstr_max()
        self.rstr.add_str(self.s)

    def test_rstr_max(self):
        r = self.rstr.go()
        for ((idStr, end), nb), (l, start_plage) in r.iteritems():
            ss = self.rstr.array_str[idStr][end - l:end]
            s = self.rstr.array_str[idStr]
            idx = 0
            for i in xrange(nb):
                idx = s.index(ss, idx) + 1
#      except ValueError, e:
#        print "+++", ss, end, i, nb
#      try:
            self.assertRaises(ValueError, s.index, ss, idx)
#        print "***", ss, end, i, nb
#      except ValueError, e:
#        pass

    def test_left_maximality(self):
        r = self.rstr.go()
        for ((idStr, end), nb), (l, start_plage) in r.iteritems():
            ss = self.rstr.array_str[idStr][end - l:end]
            s = self.rstr.array_str[idStr]
            set_left_char = set()
            for o in range(start_plage, start_plage + nb):
                su = self.rstr.array_suffix[o]
                if (su[0] == 0):
                    char_left = "START_STR"
                else:
                    char_left = self.rstr.array_str[su[1]][su[0] - 1]
                set_left_char.add(char_left)
            if (len(set_left_char) == 1):
                print
                print '*' * 10
                print set_left_char
                print ss.encode('utf-8')
                print '*' * 10
                print
            self.assertNotEqual(len(set_left_char), 1)

    def test_right_maximality(self):
        r = self.rstr.go()
        for ((idStr, end), nb), (l, start_plage) in r.iteritems():
            ss = self.rstr.array_str[idStr][end - l:end]
            s = self.rstr.array_str[idStr]
            set_right_char = set()
            for o in range(start_plage, start_plage + nb):
                su = self.rstr.array_suffix[o]
                ls = len(self.rstr.array_str[su[1]])
                if (su[0] + l == ls):
                    char_right = "END_STR"
                else:
                    char_right = self.rstr.array_str[su[1]][su[0] + l]
                set_right_char.add(char_right)
            if (len(set_right_char) == 1):
                print
                print '*' * 10
                print set_right_char
                print ss.encode('utf-8')
                print '*' * 10
                print
            self.assertNotEqual(len(set_right_char), 1)