Ejemplo n.º 1
0
def pointwise_test_features(test_data_file, is_pairwise=False):
  """Build pointwise feature vectors for every (query, document) pair.

  Args:
    test_data_file: path handed straight to extractFeatures().
    is_pairwise: when True, standardize the feature matrix with
      preprocessing.scale() before returning it (to match the scaling
      applied when training pairwise models).

  Returns:
    (X, queryStrings, index_map) where X is the list of feature vectors,
    queryStrings is the list of queries in processing order, and
    index_map[query][document] is the row index of that pair in X.
  """
  (queries, features) = extractFeatures(test_data_file)

  # Build IDF dictionary
  idfDict = getIDFScores()

  queryStrings = []
  X = []
  index_map = {}

  for query in queries.keys():
    queryStrings.append(query)

    # Raw term frequencies of the (lower-cased) query terms.
    queryVector = collections.defaultdict(int)
    for queryTerm in query.rsplit():
      queryVector[queryTerm.lower()] += 1

    # Weight each term frequency by its IDF; log(98998) stands in for the
    # IDF of a term never seen in the corpus (98998 is presumably the
    # corpus document count -- TODO confirm against getIDFScores()).
    for key in queryVector:
      queryVector[key] *= idfDict[key] if key in idfDict else log(98998)

    for document in queries[query]:
      rawCounts = getRawCounts(queries, features, query, document)

      # One feature per raw-count field: the dot product of the weighted
      # query vector with that field's term counts.
      X_i = []
      for currentCounts in rawCounts:
        currentValue = 0
        for term in queryVector:
          currentValue += queryVector[term] * currentCounts[term]
        X_i.append(currentValue)
      X.append(X_i)

      # index_map[query][url] = i means X[i] is the feature vector of
      # query and url.
      index_map.setdefault(query, {})[document] = len(X) - 1

  if is_pairwise:
    X = preprocessing.scale(X)

  return (X, queryStrings, index_map)
Ejemplo n.º 2
0
def pointwise_test_features(test_data_file, is_pairwise=False):
    """Build pointwise feature vectors for every (query, document) pair.

    Args:
        test_data_file: path handed straight to extractFeatures().
        is_pairwise: when True, standardize the feature matrix with
            preprocessing.scale() before returning it (to match the
            scaling applied when training pairwise models).

    Returns:
        (X, queryStrings, index_map) where X is the list of feature
        vectors, queryStrings is the list of queries in processing order,
        and index_map[query][document] is the row index of that pair in X.
    """
    (queries, features) = extractFeatures(test_data_file)

    # Build IDF dictionary
    idfDict = getIDFScores()

    queryStrings = []
    X = []
    index_map = {}

    for query in queries.keys():
        queryStrings.append(query)

        # Raw term frequencies of the (lower-cased) query terms.
        queryVector = collections.defaultdict(int)
        for queryTerm in query.rsplit():
            queryVector[queryTerm.lower()] += 1

        # Weight each term frequency by its IDF; log(98998) stands in for
        # the IDF of a term never seen in the corpus (98998 is presumably
        # the corpus document count -- TODO confirm against getIDFScores()).
        for key in queryVector:
            queryVector[key] *= idfDict[key] if key in idfDict else log(98998)

        for document in queries[query]:
            rawCounts = getRawCounts(queries, features, query, document)

            # One feature per raw-count field: the dot product of the
            # weighted query vector with that field's term counts.
            X_i = []
            for currentCounts in rawCounts:
                currentValue = 0
                for term in queryVector:
                    currentValue += queryVector[term] * currentCounts[term]
                X_i.append(currentValue)
            X.append(X_i)

            # index_map[query][url] = i means X[i] is the feature vector
            # of query and url.
            index_map.setdefault(query, {})[document] = len(X) - 1

    if is_pairwise:
        X = preprocessing.scale(X)

    return (X, queryStrings, index_map)
Ejemplo n.º 3
0
def pointwise_train_features(train_data_file, train_rel_file):
  """Build pointwise training data: one feature vector per (query,
  document) pair plus its relevance score.

  Args:
    train_data_file: path handed straight to extractFeatures().
    train_rel_file: path handed straight to getTrainScores().

  Returns:
    (X, Y): X is the list of feature vectors, Y the matching relevance
    scores from trainScores, in the same order.
  """
  (queries, features) = extractFeatures(train_data_file)

  # Build IDF dictionary (sublinear / log-scaled scores).
  idfDict = getIDFScores(log=True)

  trainScores = getTrainScores(train_rel_file)

  X = []
  Y = []

  for query in queries.keys():
    # Raw term frequencies of the (lower-cased) query terms.
    queryVector = collections.defaultdict(int)
    for queryTerm in query.rsplit():
      queryVector[queryTerm.lower()] += 1

    # Weight each term frequency by its IDF; log(98998) stands in for the
    # IDF of a term never seen in the corpus (98998 is presumably the
    # corpus document count -- TODO confirm against getIDFScores()).
    for key in queryVector:
      queryVector[key] *= idfDict[key] if key in idfDict else log(98998)

    for document in queries[query]:
      # extract raw counts and apply sublinear scaling
      rawCounts = getRawCounts(queries, features, query, document)

      # One feature per raw-count field: the dot product of the weighted
      # query vector with that field's term counts.
      X_i = []
      for currentCounts in rawCounts:
        currentValue = 0
        for term in queryVector:
          currentValue += queryVector[term] * currentCounts[term]
        X_i.append(currentValue)
      X.append(X_i)
      Y.append(trainScores[query][document])

  return (X, Y)
Ejemplo n.º 4
0
def pointwise_train_features(train_data_file, train_rel_file):
    """Build pointwise training data: one feature vector per (query,
    document) pair plus its relevance score.

    Args:
        train_data_file: path handed straight to extractFeatures().
        train_rel_file: path handed straight to getTrainScores().

    Returns:
        (X, Y): X is the list of feature vectors, Y the matching
        relevance scores from trainScores, in the same order.
    """
    (queries, features) = extractFeatures(train_data_file)

    # Build IDF dictionary (sublinear / log-scaled scores).
    idfDict = getIDFScores(log=True)

    trainScores = getTrainScores(train_rel_file)

    X = []
    Y = []

    for query in queries.keys():
        # Raw term frequencies of the (lower-cased) query terms.
        queryVector = collections.defaultdict(int)
        for queryTerm in query.rsplit():
            queryVector[queryTerm.lower()] += 1

        # Weight each term frequency by its IDF; log(98998) stands in for
        # the IDF of a term never seen in the corpus (98998 is presumably
        # the corpus document count -- TODO confirm against getIDFScores()).
        for key in queryVector:
            queryVector[key] *= idfDict[key] if key in idfDict else log(98998)

        for document in queries[query]:
            # extract raw counts and apply sublinear scaling
            rawCounts = getRawCounts(queries, features, query, document)

            # One feature per raw-count field: the dot product of the
            # weighted query vector with that field's term counts.
            X_i = []
            for currentCounts in rawCounts:
                currentValue = 0
                for term in queryVector:
                    currentValue += queryVector[term] * currentCounts[term]
                X_i.append(currentValue)
            X.append(X_i)
            Y.append(trainScores[query][document])

    return (X, Y)
Ejemplo n.º 5
0
def pairwise_train_features(train_data_file, train_rel_file):
  """Build pairwise (ranking-SVM style) training data.

  First computes one pointwise feature vector per (query, document) pair
  and standardizes the whole matrix; then, for every pair of documents
  retrieved for the same query whose relevance scores differ, emits the
  difference of their scaled vectors with label +1 (first doc more
  relevant) or -1.

  Args:
    train_data_file: path handed straight to extractFeatures().
    train_rel_file: path handed straight to getTrainScores().

  Returns:
    (X, Y): feature-difference vectors and their +1/-1 labels.
  """
  (queries, features) = extractFeatures(train_data_file)

  # Build IDF dictionary
  idfDict = getIDFScores()

  trainScores = getTrainScores(train_rel_file)

  X = []
  Y = []

  # Associates each query/doc pair with an index into the scaled feature matrix
  featureIndex = {}
  featuresBeforeScaling = []

  for query in queries.keys():
    featureIndex[query] = {}

    # Raw term frequencies of the (lower-cased) query terms.
    queryVector = collections.defaultdict(int)
    for queryTerm in query.rsplit():
      queryVector[queryTerm.lower()] += 1

    # Weight each term frequency by its IDF; log(98998) stands in for the
    # IDF of a term never seen in the corpus (98998 is presumably the
    # corpus document count -- TODO confirm against getIDFScores()).
    for key in queryVector:
      queryVector[key] *= idfDict[key] if key in idfDict else log(98998)

    for document in queries[query]:
      # extract raw counts and apply sublinear scaling
      rawCounts = getRawCounts(queries, features, query, document)

      # One feature per raw-count field: the dot product of the weighted
      # query vector with that field's term counts.
      row = []
      for currentCounts in rawCounts:
        currentValue = 0
        for term in queryVector:
          currentValue += queryVector[term] * currentCounts[term]
        row.append(currentValue)
      featuresBeforeScaling.append(row)
      featureIndex[query][document] = len(featuresBeforeScaling) - 1

  # Renamed from `features` to avoid shadowing the raw features dict above.
  scaledFeatures = preprocessing.scale(featuresBeforeScaling)

  for query in queries.keys():
    results = queries[query]
    for d1, document1 in enumerate(results):
      X_d1 = scaledFeatures[featureIndex[query][document1]]
      d1Score = trainScores[query][document1]

      # Only unordered pairs (d1, d2) with d2 after d1 in the result list.
      for document2 in results[d1 + 1:]:
        d2Score = trainScores[query][document2]
        if d1Score == d2Score:
          continue  # equal relevance carries no ordering signal

        X_d2 = scaledFeatures[featureIndex[query][document2]]
        X.append([x1 - x2 for x1, x2 in zip(X_d1, X_d2)])
        Y.append(1 if d1Score > d2Score else -1)

  return (X, Y)
Ejemplo n.º 6
0
def pairwise_train_features(train_data_file, train_rel_file):
    """Build pairwise (ranking-SVM style) training data.

    First computes one pointwise feature vector per (query, document)
    pair and standardizes the whole matrix; then, for every pair of
    documents retrieved for the same query whose relevance scores differ,
    emits the difference of their scaled vectors with label +1 (first doc
    more relevant) or -1.

    Args:
        train_data_file: path handed straight to extractFeatures().
        train_rel_file: path handed straight to getTrainScores().

    Returns:
        (X, Y): feature-difference vectors and their +1/-1 labels.
    """
    (queries, features) = extractFeatures(train_data_file)

    # Build IDF dictionary
    idfDict = getIDFScores()

    trainScores = getTrainScores(train_rel_file)

    X = []
    Y = []

    # Associates each query/doc pair with an index into the scaled feature matrix
    featureIndex = {}
    featuresBeforeScaling = []

    for query in queries.keys():
        featureIndex[query] = {}

        # Raw term frequencies of the (lower-cased) query terms.
        queryVector = collections.defaultdict(int)
        for queryTerm in query.rsplit():
            queryVector[queryTerm.lower()] += 1

        # Weight each term frequency by its IDF; log(98998) stands in for
        # the IDF of a term never seen in the corpus (98998 is presumably
        # the corpus document count -- TODO confirm against getIDFScores()).
        for key in queryVector:
            queryVector[key] *= idfDict[key] if key in idfDict else log(98998)

        for document in queries[query]:
            # extract raw counts and apply sublinear scaling
            rawCounts = getRawCounts(queries, features, query, document)

            # One feature per raw-count field: the dot product of the
            # weighted query vector with that field's term counts.
            row = []
            for currentCounts in rawCounts:
                currentValue = 0
                for term in queryVector:
                    currentValue += queryVector[term] * currentCounts[term]
                row.append(currentValue)
            featuresBeforeScaling.append(row)
            featureIndex[query][document] = len(featuresBeforeScaling) - 1

    # Renamed from `features` to avoid shadowing the raw features dict above.
    scaledFeatures = preprocessing.scale(featuresBeforeScaling)

    for query in queries.keys():
        results = queries[query]
        for d1, document1 in enumerate(results):
            X_d1 = scaledFeatures[featureIndex[query][document1]]
            d1Score = trainScores[query][document1]

            # Only unordered pairs (d1, d2) with d2 after d1 in the list.
            for document2 in results[d1 + 1:]:
                d2Score = trainScores[query][document2]
                if d1Score == d2Score:
                    continue  # equal relevance carries no ordering signal

                X_d2 = scaledFeatures[featureIndex[query][document2]]
                X.append([x1 - x2 for x1, x2 in zip(X_d1, X_d2)])
                Y.append(1 if d1Score > d2Score else -1)

    return (X, Y)